From c39564332bdd5030b9031ed3b1a428464fea670e Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 27 Aug 2019 10:46:06 -0700
Subject: Mount volumes as super user

This used to be the case, but regressed after a recent change.
Also made a few fixes around it and clean up the code a bit.

Closes #720

PiperOrigin-RevId: 265717496
---
 pkg/sentry/fs/mounts.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 9b713e785..ac0398bd9 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -171,8 +171,6 @@ type MountNamespace struct {
 // NewMountNamespace returns a new MountNamespace, with the provided node at the
 // root, and the given cache size. A root must always be provided.
 func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
-	creds := auth.CredentialsFromContext(ctx)
-
 	// Set the root dirent and id on the root mount. The reference returned from
 	// NewDirent will be donated to the MountNamespace constructed below.
 	d := NewDirent(ctx, root, "/")
@@ -181,6 +179,7 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error
 		d: newRootMount(1, d),
 	}
 
+	creds := auth.CredentialsFromContext(ctx)
 	mns := MountNamespace{
 		userns:  creds.UserNamespace,
 		root:    d,
-- 
cgit v1.2.3


From 8fd89fd7a2b4a69c76b126fa52f47757b6076d36 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 27 Aug 2019 10:51:23 -0700
Subject: Fix sendfile(2) error code

When output file is in append mode, sendfile(2) should fail
with EINVAL and not EBADF.

Closes #721

PiperOrigin-RevId: 265718958
---
 pkg/sentry/syscalls/linux/sys_splice.go | 19 +++++++++++++------
 test/syscalls/linux/sendfile.cc         | 22 +++++++++++++++++++++-
 2 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 17e3dde1f..8a98fedcb 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -91,22 +91,29 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	// Get files.
+	inFile := t.GetFile(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+
+	if !inFile.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
 	outFile := t.GetFile(outFD)
 	if outFile == nil {
 		return 0, nil, syserror.EBADF
 	}
 	defer outFile.DecRef()
 
-	inFile := t.GetFile(inFD)
-	if inFile == nil {
+	if !outFile.Flags().Write {
 		return 0, nil, syserror.EBADF
 	}
-	defer inFile.DecRef()
 
-	// Verify that the outfile Append flag is not set. Note that fs.Splice
-	// itself validates that the output file is writable.
+	// Verify that the outfile Append flag is not set.
 	if outFile.Flags().Append {
-		return 0, nil, syserror.EBADF
+		return 0, nil, syserror.EINVAL
 	}
 
 	// Verify that we have a regular infile. This is a requirement; the
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index e5d72e28a..9167ab066 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -299,10 +299,30 @@ TEST(SendFileTest, DoNotSendfileIfOutfileIsAppendOnly) {
 
   // Open the output file as append only.
   const FileDescriptor outf =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_APPEND));
+      ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY | O_APPEND));
 
   // Send data and verify that sendfile returns the correct errno.
   EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SendFileTest, AppendCheckOrdering) {
+  constexpr char kData[] = "And by opposing end them: to die, to sleep";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+  const FileDescriptor read =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+  const FileDescriptor write =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+  const FileDescriptor append =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_APPEND));
+
+  // Check that read/write file mode is verified before append.
+  EXPECT_THAT(sendfile(append.get(), read.get(), nullptr, kDataSize),
+              SyscallFailsWithErrno(EBADF));
+  EXPECT_THAT(sendfile(write.get(), write.get(), nullptr, kDataSize),
               SyscallFailsWithErrno(EBADF));
 }
 
-- 
cgit v1.2.3


From 9679f9891fe524647d5027a3cfb9c892604df064 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 27 Aug 2019 11:42:52 -0700
Subject: Fix comment typo

PiperOrigin-RevId: 265731735
---
 pkg/sentry/loader/elf.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index bc5b841fb..ba9c9ce12 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -464,7 +464,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 	// base address big enough to fit all segments, so we first create a
 	// mapping for the total size just to find a region that is big enough.
 	//
-	// It is safe to unmap it immediately with racing with another mapping
+	// It is safe to unmap it immediately without racing with another mapping
 	// because we are the only one in control of the MemoryManager.
 	//
 	// Note that the vaddr of the first PT_LOAD segment is ignored when
-- 
cgit v1.2.3


From 36a8949b2a52aabbe3f0548f1207c133da113c56 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 29 Aug 2019 10:50:48 -0700
Subject: Add limit_host_fd_translation Gofer mount option.

PiperOrigin-RevId: 266177409
---
 pkg/sentry/fs/fsutil/inode_cached.go      | 34 +++++++++++++++++++++++--------
 pkg/sentry/fs/fsutil/inode_cached_test.go | 10 ++++-----
 pkg/sentry/fs/gofer/fs.go                 | 22 ++++++++++++++------
 pkg/sentry/fs/gofer/session.go            | 27 ++++++++++++++++--------
 pkg/sentry/fs/host/inode.go               |  6 ++++--
 5 files changed, 68 insertions(+), 31 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index e70bc28fb..20cb9a367 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -66,10 +66,8 @@ type CachingInodeOperations struct {
 	// mfp is used to allocate memory that caches backingFile's contents.
 	mfp pgalloc.MemoryFileProvider
 
-	// forcePageCache indicates the sentry page cache should be used regardless
-	// of whether the platform supports host mapped I/O or not. This must not be
-	// modified after inode creation.
-	forcePageCache bool
+	// opts contains options. opts is immutable.
+	opts CachingInodeOperationsOptions
 
 	attrMu sync.Mutex `state:"nosave"`
 
@@ -116,6 +114,20 @@ type CachingInodeOperations struct {
 	refs frameRefSet
 }
 
+// CachingInodeOperationsOptions configures a CachingInodeOperations.
+//
+// +stateify savable
+type CachingInodeOperationsOptions struct {
+	// If ForcePageCache is true, use the sentry page cache even if a host file
+	// descriptor is available.
+	ForcePageCache bool
+
+	// If LimitHostFDTranslation is true, apply maxFillRange() constraints to
+	// host file descriptor mappings returned by
+	// CachingInodeOperations.Translate().
+	LimitHostFDTranslation bool
+}
+
 // CachedFileObject is a file that may require caching.
 type CachedFileObject interface {
 	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
@@ -159,7 +171,7 @@ type CachedFileObject interface {
 
 // NewCachingInodeOperations returns a new CachingInodeOperations backed by
 // a CachedFileObject and its initial unstable attributes.
-func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations {
+func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
 	if mfp == nil {
 		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
@@ -167,7 +179,7 @@ func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject
 	return &CachingInodeOperations{
 		backingFile:    backingFile,
 		mfp:            mfp,
-		forcePageCache: forcePageCache,
+		opts:           opts,
 		attr:           uattr,
 		hostFileMapper: NewHostFileMapper(),
 	}
@@ -763,7 +775,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
 // and memory mappings, and false if c.cache may contain data cached from
 // c.backingFile.
 func (c *CachingInodeOperations) useHostPageCache() bool {
-	return !c.forcePageCache && c.backingFile.FD() >= 0
+	return !c.opts.ForcePageCache && c.backingFile.FD() >= 0
 }
 
 // AddMapping implements memmap.Mappable.AddMapping.
@@ -835,11 +847,15 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp
 func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
 	// Hot path. Avoid defer.
 	if c.useHostPageCache() {
+		mr := optional
+		if c.opts.LimitHostFDTranslation {
+			mr = maxFillRange(required, optional)
+		}
 		return []memmap.Translation{
 			{
-				Source: optional,
+				Source: mr,
 				File:   c,
-				Offset: optional.Start,
+				Offset: mr.Start,
 				Perms:  usermem.AnyAccess,
 			},
 		}, nil
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
index dc19255ed..eb5730c35 100644
--- a/pkg/sentry/fs/fsutil/inode_cached_test.go
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -61,7 +61,7 @@ func TestSetPermissions(t *testing.T) {
 	uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
 		Perms: fs.FilePermsFromMode(0444),
 	})
-	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	perms := fs.FilePermsFromMode(0777)
@@ -150,7 +150,7 @@ func TestSetTimestamps(t *testing.T) {
 				ModificationTime: epoch,
 				StatusChangeTime: epoch,
 			}
-			iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+			iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
 			defer iops.Release()
 
 			if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil {
@@ -188,7 +188,7 @@ func TestTruncate(t *testing.T) {
 	uattr := fs.UnstableAttr{
 		Size: 0,
 	}
-	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	if err := iops.Truncate(ctx, nil, uattr.Size); err != nil {
@@ -280,7 +280,7 @@ func TestRead(t *testing.T) {
 	uattr := fs.UnstableAttr{
 		Size: int64(len(buf)),
 	}
-	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	// Expect the cache to be initially empty.
@@ -336,7 +336,7 @@ func TestWrite(t *testing.T) {
 	uattr := fs.UnstableAttr{
 		Size: int64(len(buf)),
 	}
-	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+	iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, CachingInodeOperationsOptions{})
 	defer iops.Release()
 
 	// Expect the cache to be initially empty.
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index 69999dc28..8f8ab5d29 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -54,6 +54,10 @@ const (
 	// sandbox using files backed by the gofer. If set to false, unix sockets
 	// cannot be bound to gofer files without an overlay on top.
 	privateUnixSocketKey = "privateunixsocket"
+
+	// If present, sets CachingInodeOperationsOptions.LimitHostFDTranslation to
+	// true.
+	limitHostFDTranslationKey = "limit_host_fd_translation"
 )
 
 // defaultAname is the default attach name.
@@ -134,12 +138,13 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
 
 // opts are parsed 9p mount options.
 type opts struct {
-	fd                int
-	aname             string
-	policy            cachePolicy
-	msize             uint32
-	version           string
-	privateunixsocket bool
+	fd                     int
+	aname                  string
+	policy                 cachePolicy
+	msize                  uint32
+	version                string
+	privateunixsocket      bool
+	limitHostFDTranslation bool
 }
 
 // options parses mount(2) data into structured options.
@@ -237,6 +242,11 @@ func options(data string) (opts, error) {
 		delete(options, privateUnixSocketKey)
 	}
 
+	if _, ok := options[limitHostFDTranslationKey]; ok {
+		o.limitHostFDTranslation = true
+		delete(options, limitHostFDTranslationKey)
+	}
+
 	// Fail to attach if the caller wanted us to do something that we
 	// don't support.
 	if len(options) > 0 {
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 69d08a627..50da865c1 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -117,6 +117,11 @@ type session struct {
 	// Flags provided to the mount.
 	superBlockFlags fs.MountSourceFlags `state:"wait"`
 
+	// limitHostFDTranslation is the value used for
+	// CachingInodeOperationsOptions.LimitHostFDTranslation for all
+	// CachingInodeOperations created by the session.
+	limitHostFDTranslation bool
+
 	// connID is a unique identifier for the session connection.
 	connID string `state:"wait"`
 
@@ -218,8 +223,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p
 
 	uattr := unstable(ctx, valid, attr, s.mounter, s.client)
 	return sattr, &inodeOperations{
-		fileState:       fileState,
-		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, s.superBlockFlags.ForcePageCache),
+		fileState: fileState,
+		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
+			ForcePageCache:         s.superBlockFlags.ForcePageCache,
+			LimitHostFDTranslation: s.limitHostFDTranslation,
+		}),
 	}
 }
 
@@ -242,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 
 	// Construct the session.
 	s := session{
-		connID:          dev,
-		msize:           o.msize,
-		version:         o.version,
-		cachePolicy:     o.policy,
-		aname:           o.aname,
-		superBlockFlags: superBlockFlags,
-		mounter:         mounter,
+		connID:                 dev,
+		msize:                  o.msize,
+		version:                o.version,
+		cachePolicy:            o.policy,
+		aname:                  o.aname,
+		superBlockFlags:        superBlockFlags,
+		limitHostFDTranslation: o.limitHostFDTranslation,
+		mounter:                mounter,
 	}
 	s.EnableLeakCheck("gofer.session")
 
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 679d8321a..894ab01f0 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -200,8 +200,10 @@ func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool,
 	// Build the fs.InodeOperations.
 	uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
 	iops := &inodeOperations{
-		fileState:       fileState,
-		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache),
+		fileState: fileState,
+		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
+			ForcePageCache: msrc.Flags.ForcePageCache,
+		}),
 	}
 
 	// Return the fs.Inode.
-- 
cgit v1.2.3


From 863e11ac4d6a49787cd5e5f6fe1cd771d0ceb100 Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Thu, 29 Aug 2019 14:29:43 -0700
Subject: Implement /proc/net/udp.

PiperOrigin-RevId: 266229756
---
 pkg/sentry/fs/proc/BUILD                   |   1 -
 pkg/sentry/fs/proc/net.go                  | 201 +++++++++++++++++--
 pkg/sentry/socket/epsocket/epsocket.go     |  25 ++-
 pkg/tcpip/transport/udp/endpoint.go        |  61 +++---
 pkg/tcpip/transport/udp/endpoint_state.go  |   4 +-
 pkg/tcpip/transport/udp/forwarder.go       |   2 +-
 test/syscalls/BUILD                        |   4 +
 test/syscalls/linux/BUILD                  |  15 ++
 test/syscalls/linux/ip_socket_test_util.cc |  10 +
 test/syscalls/linux/ip_socket_test_util.h  |  25 +++
 test/syscalls/linux/proc_net_tcp.cc        |  65 ++----
 test/syscalls/linux/proc_net_udp.cc        | 309 +++++++++++++++++++++++++++++
 test/util/fs_util.cc                       |   9 +
 test/util/fs_util.h                        |   3 +
 14 files changed, 633 insertions(+), 101 deletions(-)
 create mode 100644 test/syscalls/linux/proc_net_udp.cc

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 70ed854a8..c7599d1f6 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -31,7 +31,6 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/log",
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 9adb23608..5e28982c5 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -17,10 +17,10 @@ package proc
 import (
 	"bytes"
 	"fmt"
+	"io"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -28,9 +28,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
 // newNet creates a new proc net entry.
@@ -57,9 +59,8 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo
 			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function")),
 			"route":  newStaticProcInode(ctx, msrc, []byte("Iface   Destination     Gateway         Flags   RefCnt  Use     Metric  Mask            MTU     Window  IRTT")),
 			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
-			"udp":    newStaticProcInode(ctx, msrc, []byte("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops")),
-
-			"unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
+			"udp":    seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc),
+			"unix":   seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
 		}
 
 		if s.SupportsIPv6() {
@@ -216,7 +217,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
 	for _, se := range n.k.ListSockets() {
 		s := se.Sock.Get()
 		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
 			continue
 		}
 		sfile := s.(*fs.File)
@@ -297,6 +298,42 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
 	return data, 0
 }
 
+func networkToHost16(n uint16) uint16 {
+	// n is in network byte order, so is big-endian. The most-significant byte
+	// should be stored in the lower address.
+	//
+	// We manually inline binary.BigEndian.Uint16() because Go does not support
+	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
+	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
+	// interface method call, defeating inlining.
+	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
+	return usermem.ByteOrder.Uint16(buf[:])
+}
+
+func writeInetAddr(w io.Writer, a linux.SockAddrInet) {
+	// linux.SockAddrInet.Port is stored in the network byte order and is
+	// printed like a number in host byte order. Note that all numbers in host
+	// byte order are printed with the most-significant byte first when
+	// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
+	port := networkToHost16(a.Port)
+
+	// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
+	// (i.e. most-significant byte in index 0). Linux represents this as a
+	// __be32 which is a typedef for an unsigned int, and is printed with
+	// %X. This means that for a little-endian machine, Linux prints the
+	// least-significant byte of the address first. To emulate this, we first
+	// invert the byte order for the address using usermem.ByteOrder.Uint32,
+	// which makes it have the equivalent encoding to a __be32 on a little
+	// endian machine. Note that this operation is a no-op on a big endian
+	// machine. Then similar to Linux, we format it with %X, which will print
+	// the most-significant byte of the __be32 address first, which is now
+	// actually the least-significant byte of the original address in
+	// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
+	addr := usermem.ByteOrder.Uint32(a.Addr[:])
+
+	fmt.Fprintf(w, "%08X:%04X ", addr, port)
+}
+
 // netTCP implements seqfile.SeqSource for /proc/net/tcp.
 //
 // +stateify savable
@@ -311,6 +348,9 @@ func (*netTCP) NeedsUpdate(generation int64) bool {
 
 // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
 func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
 	t := kernel.TaskFromContext(ctx)
 
 	if h != nil {
@@ -321,7 +361,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	for _, se := range n.k.ListSockets() {
 		s := se.Sock.Get()
 		if s == nil {
-			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
 			continue
 		}
 		sfile := s.(*fs.File)
@@ -343,27 +383,23 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 		// Field: sl; entry number.
 		fmt.Fprintf(&buf, "%4d: ", se.ID)
 
-		portBuf := make([]byte, 2)
-
 		// Field: local_adddress.
 		var localAddr linux.SockAddrInet
-		if local, _, err := sops.GetSockName(t); err == nil {
-			localAddr = *local.(*linux.SockAddrInet)
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = *local.(*linux.SockAddrInet)
+			}
 		}
-		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
-		fmt.Fprintf(&buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(localAddr.Addr[:]),
-			portBuf)
+		writeInetAddr(&buf, localAddr)
 
 		// Field: rem_address.
 		var remoteAddr linux.SockAddrInet
-		if remote, _, err := sops.GetPeerName(t); err == nil {
-			remoteAddr = *remote.(*linux.SockAddrInet)
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = *remote.(*linux.SockAddrInet)
+			}
 		}
-		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
-		fmt.Fprintf(&buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
-			portBuf)
+		writeInetAddr(&buf, remoteAddr)
 
 		// Field: state; socket state.
 		fmt.Fprintf(&buf, "%02X ", sops.State())
@@ -386,7 +422,8 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
 			fmt.Fprintf(&buf, "%5d ", 0)
 		} else {
-			fmt.Fprintf(&buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(&buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
 		}
 
 		// Field: timeout; number of unanswered 0-window probes.
@@ -438,3 +475,125 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	}
 	return data, 0
 }
+
+// netUDP implements seqfile.SeqSource for /proc/net/udp.
+//
+// +stateify savable
+type netUDP struct {
+	k *kernel.Kernel
+}
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (*netUDP) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
+func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	if h != nil {
+		return nil, 0
+	}
+
+	var buf bytes.Buffer
+	for _, se := range n.k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
+			s.DecRef()
+			// Not udp4 socket.
+			continue
+		}
+
+		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
+
+		// Field: sl; entry number.
+		fmt.Fprintf(&buf, "%5d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddrInet
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = *local.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(&buf, localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = *remote.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(&buf, remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(&buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(&buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when. Always 0 for UDP.
+		fmt.Fprintf(&buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt. Always 0 for UDP.
+		fmt.Fprintf(&buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(&buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(&buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout. Always 0 for UDP.
+		fmt.Fprintf(&buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(&buf, "%8d ", sfile.InodeID())
+
+		// Field: ref; reference count on the socket inode. Don't count the ref
+		// we obtain while deferencing the weakref to this socket.
+		fmt.Fprintf(&buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(&buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: drops; number of dropped packets. Unimplemented.
+		fmt.Fprintf(&buf, "%d", 0)
+
+		fmt.Fprintf(&buf, "\n")
+
+		s.DecRef()
+	}
+
+	data := []seqfile.SeqData{
+		{
+			Buf:    []byte("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops             \n"),
+			Handle: n,
+		},
+		{
+			Buf:    buf.Bytes(),
+			Handle: n,
+		},
+	}
+	return data, 0
+}
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 635042263..def29646e 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -27,12 +27,14 @@ package epsocket
 import (
 	"bytes"
 	"math"
+	"reflect"
 	"sync"
 	"syscall"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -52,6 +54,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -2421,7 +2424,8 @@ func (s *SocketOperations) State() uint32 {
 		return 0
 	}
 
-	if !s.isPacketBased() {
+	switch {
+	case s.skType == linux.SOCK_STREAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_TCP:
 		// TCP socket.
 		switch tcp.EndpointState(s.Endpoint.State()) {
 		case tcp.StateEstablished:
@@ -2450,9 +2454,26 @@ func (s *SocketOperations) State() uint32 {
 			// Internal or unknown state.
 			return 0
 		}
+	case s.skType == linux.SOCK_DGRAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_UDP:
+		// UDP socket.
+		switch udp.EndpointState(s.Endpoint.State()) {
+		case udp.StateInitial, udp.StateBound, udp.StateClosed:
+			return linux.TCP_CLOSE
+		case udp.StateConnected:
+			return linux.TCP_ESTABLISHED
+		default:
+			return 0
+		}
+	case s.skType == linux.SOCK_DGRAM && s.protocol == syscall.IPPROTO_ICMP || s.protocol == syscall.IPPROTO_ICMPV6:
+		// TODO(b/112063468): Export states for ICMP sockets.
+	case s.skType == linux.SOCK_RAW:
+		// TODO(b/112063468): Export states for raw sockets.
+	default:
+		// Unknown transport protocol, how did we make this socket?
+		log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
+		return 0
 	}
 
-	// TODO(b/112063468): Export states for UDP, ICMP, and raw sockets.
 	return 0
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index ac5905772..66455ef46 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -37,13 +37,17 @@ type udpPacket struct {
 	views [8]buffer.View `state:"nosave"`
 }
 
-type endpointState int
+// EndpointState represents the state of a UDP endpoint.
+type EndpointState uint32
 
+// Endpoint states. Note that are represented in a netstack-specific manner and
+// may not be meaningful externally. Specifically, they need to be translated to
+// Linux's representation for these states if presented to userspace.
 const (
-	stateInitial endpointState = iota
-	stateBound
-	stateConnected
-	stateClosed
+	StateInitial EndpointState = iota
+	StateBound
+	StateConnected
+	StateClosed
 )
 
 // endpoint represents a UDP endpoint. This struct serves as the interface
@@ -74,7 +78,7 @@ type endpoint struct {
 	mu             sync.RWMutex `state:"nosave"`
 	sndBufSize     int
 	id             stack.TransportEndpointID
-	state          endpointState
+	state          EndpointState
 	bindNICID      tcpip.NICID
 	regNICID       tcpip.NICID
 	route          stack.Route `state:"manual"`
@@ -140,7 +144,7 @@ func (e *endpoint) Close() {
 	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
 
 	switch e.state {
-	case stateBound, stateConnected:
+	case StateBound, StateConnected:
 		e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
 		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort)
 	}
@@ -163,7 +167,7 @@ func (e *endpoint) Close() {
 	e.route.Release()
 
 	// Update the state.
-	e.state = stateClosed
+	e.state = StateClosed
 
 	e.mu.Unlock()
 
@@ -211,11 +215,11 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 // Returns true for retry if preparation should be retried.
 func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) {
 	switch e.state {
-	case stateInitial:
-	case stateConnected:
+	case StateInitial:
+	case StateConnected:
 		return false, nil
 
-	case stateBound:
+	case StateBound:
 		if to == nil {
 			return false, tcpip.ErrDestinationRequired
 		}
@@ -232,7 +236,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 
 	// The state changed when we released the shared locked and re-acquired
 	// it in exclusive mode. Try again.
-	if e.state != stateInitial {
+	if e.state != StateInitial {
 		return true, nil
 	}
 
@@ -322,7 +326,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-cha
 			defer e.mu.Unlock()
 
 			// Recheck state after lock was re-acquired.
-			if e.state != stateConnected {
+			if e.state != StateConnected {
 				return 0, nil, tcpip.ErrInvalidEndpointState
 			}
 		}
@@ -400,7 +404,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		// We only allow this to be set when we're in the initial state.
-		if e.state != stateInitial {
+		if e.state != StateInitial {
 			return tcpip.ErrInvalidEndpointState
 		}
 
@@ -726,7 +730,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	if e.state != stateConnected {
+	if e.state != StateConnected {
 		return nil
 	}
 	id := stack.TransportEndpointID{}
@@ -741,9 +745,9 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 		if err != nil {
 			return err
 		}
-		e.state = stateBound
+		e.state = StateBound
 	} else {
-		e.state = stateInitial
+		e.state = StateInitial
 	}
 
 	e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
@@ -772,8 +776,8 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	nicid := addr.NIC
 	var localPort uint16
 	switch e.state {
-	case stateInitial:
-	case stateBound, stateConnected:
+	case StateInitial:
+	case StateBound, StateConnected:
 		localPort = e.id.LocalPort
 		if e.bindNICID == 0 {
 			break
@@ -801,7 +805,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		RemoteAddress: r.RemoteAddress,
 	}
 
-	if e.state == stateInitial {
+	if e.state == StateInitial {
 		id.LocalAddress = r.LocalAddress
 	}
 
@@ -832,7 +836,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	e.regNICID = nicid
 	e.effectiveNetProtos = netProtos
 
-	e.state = stateConnected
+	e.state = StateConnected
 
 	e.rcvMu.Lock()
 	e.rcvReady = true
@@ -854,7 +858,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 	// A socket in the bound state can still receive multicast messages,
 	// so we need to notify waiters on shutdown.
-	if e.state != stateBound && e.state != stateConnected {
+	if e.state != StateBound && e.state != StateConnected {
 		return tcpip.ErrNotConnected
 	}
 
@@ -903,7 +907,7 @@ func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.Networ
 func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore.
-	if e.state != stateInitial {
+	if e.state != StateInitial {
 		return tcpip.ErrInvalidEndpointState
 	}
 
@@ -946,7 +950,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	e.effectiveNetProtos = netProtos
 
 	// Mark endpoint as bound.
-	e.state = stateBound
+	e.state = StateBound
 
 	e.rcvMu.Lock()
 	e.rcvReady = true
@@ -989,7 +993,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	if e.state != stateConnected {
+	if e.state != StateConnected {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
@@ -1069,10 +1073,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
 }
 
-// State implements socket.Socket.State.
+// State implements tcpip.Endpoint.State.
 func (e *endpoint) State() uint32 {
-	// TODO(b/112063468): Translate internal state to values returned by Linux.
-	return 0
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return uint32(e.state)
 }
 
 func isBroadcastOrMulticast(a tcpip.Address) bool {
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 5cbb56120..be46e6d4e 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -77,7 +77,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		}
 	}
 
-	if e.state != stateBound && e.state != stateConnected {
+	if e.state != StateBound && e.state != StateConnected {
 		return
 	}
 
@@ -92,7 +92,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	}
 
 	var err *tcpip.Error
-	if e.state == stateConnected {
+	if e.state == StateConnected {
 		e.route, err = e.stack.FindRoute(e.regNICID, e.id.LocalAddress, e.id.RemoteAddress, netProto, e.multicastLoop)
 		if err != nil {
 			panic(err)
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index a874fc9fd..a9edc2c8d 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -84,7 +84,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 	ep.dstPort = r.id.RemotePort
 	ep.regNICID = r.route.NICID()
 
-	ep.state = stateConnected
+	ep.state = StateConnected
 
 	ep.rcvMu.Lock()
 	ep.rcvReady = true
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index ccae4925f..6947ddc25 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -691,6 +691,10 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
 
+syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
+
 go_binary(
     name = "syscall_test_runner",
     srcs = ["syscall_test_runner.go"],
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 88f3bfcb3..1ce38c929 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3472,3 +3472,18 @@ cc_binary(
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_binary(
+    name = "proc_net_udp_test",
+    testonly = 1,
+    srcs = ["proc_net_udp.cc"],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index c73262e72..410b42a47 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -23,6 +23,16 @@
 namespace gvisor {
 namespace testing {
 
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr) {
+  auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
+  return in_addr->sin_addr.s_addr;
+}
+
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
+  auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
+  return ntohs(in_addr->sin_port);
+}
+
 PosixErrorOr<int> InterfaceIndex(std::string name) {
   // TODO(igudger): Consider using netlink.
   ifreq req = {};
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index b498a053d..3d36b9620 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -26,6 +26,31 @@
 namespace gvisor {
 namespace testing {
 
+// Possible values of the "st" field in a /proc/net/{tcp,udp} entry. Source:
+// Linux kernel, include/net/tcp_states.h.
+enum {
+  TCP_ESTABLISHED = 1,
+  TCP_SYN_SENT,
+  TCP_SYN_RECV,
+  TCP_FIN_WAIT1,
+  TCP_FIN_WAIT2,
+  TCP_TIME_WAIT,
+  TCP_CLOSE,
+  TCP_CLOSE_WAIT,
+  TCP_LAST_ACK,
+  TCP_LISTEN,
+  TCP_CLOSING,
+  TCP_NEW_SYN_RECV,
+
+  TCP_MAX_STATES
+};
+
+// Extracts the IP address from an inet sockaddr in network byte order.
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
+
+// Extracts the port from an inet sockaddr in host byte order.
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr);
+
 // InterfaceIndex returns the index of the named interface.
 PosixErrorOr<int> InterfaceIndex(std::string name);
 
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 498f62d9c..f6d7ad0bb 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -38,25 +38,6 @@ constexpr char kProcNetTCPHeader[] =
     "retrnsmt   uid  timeout inode                                             "
     "        ";
 
-// Possible values of the "st" field in a /proc/net/tcp entry. Source: Linux
-// kernel, include/net/tcp_states.h.
-enum {
-  TCP_ESTABLISHED = 1,
-  TCP_SYN_SENT,
-  TCP_SYN_RECV,
-  TCP_FIN_WAIT1,
-  TCP_FIN_WAIT2,
-  TCP_TIME_WAIT,
-  TCP_CLOSE,
-  TCP_CLOSE_WAIT,
-  TCP_LAST_ACK,
-  TCP_LISTEN,
-  TCP_CLOSING,
-  TCP_NEW_SYN_RECV,
-
-  TCP_MAX_STATES
-};
-
 // TCPEntry represents a single entry from /proc/net/tcp.
 struct TCPEntry {
   uint32_t local_addr;
@@ -70,42 +51,35 @@ struct TCPEntry {
   uint64_t inode;
 };
 
-uint32_t IP(const struct sockaddr* addr) {
-  auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
-  return in_addr->sin_addr.s_addr;
-}
-
-uint16_t Port(const struct sockaddr* addr) {
-  auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
-  return ntohs(in_addr->sin_port);
-}
-
 // Finds the first entry in 'entries' for which 'predicate' returns true.
-// Returns true on match, and sets 'match' to point to the matching entry.
-bool FindBy(std::vector<TCPEntry> entries, TCPEntry* match,
+// Returns true on match, and sets 'match' to a copy of the matching entry. If
+// 'match' is null, it's ignored.
+bool FindBy(const std::vector<TCPEntry>& entries, TCPEntry* match,
             std::function<bool(const TCPEntry&)> predicate) {
-  for (int i = 0; i < entries.size(); ++i) {
-    if (predicate(entries[i])) {
-      *match = entries[i];
+  for (const TCPEntry& entry : entries) {
+    if (predicate(entry)) {
+      if (match != nullptr) {
+        *match = entry;
+      }
       return true;
     }
   }
   return false;
 }
 
-bool FindByLocalAddr(std::vector<TCPEntry> entries, TCPEntry* match,
+bool FindByLocalAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
                      const struct sockaddr* addr) {
-  uint32_t host = IP(addr);
-  uint16_t port = Port(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const TCPEntry& e) {
     return (e.local_addr == host && e.local_port == port);
   });
 }
 
-bool FindByRemoteAddr(std::vector<TCPEntry> entries, TCPEntry* match,
+bool FindByRemoteAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
                       const struct sockaddr* addr) {
-  uint32_t host = IP(addr);
-  uint16_t port = Port(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const TCPEntry& e) {
     return (e.remote_addr == host && e.remote_port == port);
   });
@@ -120,7 +94,7 @@ PosixErrorOr<std::vector<TCPEntry>> ProcNetTCPEntries() {
   std::vector<TCPEntry> entries;
   std::vector<std::string> lines = StrSplit(content, '\n');
   std::cerr << "<contents of /proc/net/tcp>" << std::endl;
-  for (std::string line : lines) {
+  for (const std::string& line : lines) {
     std::cerr << line << std::endl;
 
     if (!found_header) {
@@ -204,9 +178,8 @@ TEST(ProcNetTCP, BindAcceptConnect) {
     EXPECT_EQ(entries.size(), 2);
   }
 
-  TCPEntry e;
-  EXPECT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr()));
-  EXPECT_TRUE(FindByRemoteAddr(entries, &e, sockets->first_addr()));
+  EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->first_addr()));
+  EXPECT_TRUE(FindByRemoteAddr(entries, nullptr, sockets->first_addr()));
 }
 
 TEST(ProcNetTCP, InodeReasonable) {
@@ -261,8 +234,8 @@ TEST(ProcNetTCP, State) {
   FileDescriptor accepted =
       ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
 
-  const uint32_t accepted_local_host = IP(&addr);
-  const uint16_t accepted_local_port = Port(&addr);
+  const uint32_t accepted_local_host = IPFromInetSockaddr(&addr);
+  const uint16_t accepted_local_port = PortFromInetSockaddr(&addr);
 
   entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
   TCPEntry accepted_entry;
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
new file mode 100644
index 000000000..369df8e0e
--- /dev/null
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -0,0 +1,309 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using absl::StrCat;
+using absl::StrFormat;
+using absl::StrSplit;
+
+constexpr char kProcNetUDPHeader[] =
+    "  sl  local_address rem_address   st tx_queue rx_queue tr tm->when "
+    "retrnsmt   uid  timeout inode ref pointer drops             ";
+
+// UDPEntry represents a single entry from /proc/net/udp.
+struct UDPEntry {
+  uint32_t local_addr;
+  uint16_t local_port;
+
+  uint32_t remote_addr;
+  uint16_t remote_port;
+
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
+};
+
+std::string DescribeFirstInetSocket(const SocketPair& sockets) {
+  const struct sockaddr* addr = sockets.first_addr();
+  return StrFormat("First test socket: fd:%d %8X:%4X", sockets.first_fd(),
+                   IPFromInetSockaddr(addr), PortFromInetSockaddr(addr));
+}
+
+std::string DescribeSecondInetSocket(const SocketPair& sockets) {
+  const struct sockaddr* addr = sockets.second_addr();
+  return StrFormat("Second test socket fd:%d %8X:%4X", sockets.second_fd(),
+                   IPFromInetSockaddr(addr), PortFromInetSockaddr(addr));
+}
+
+// Finds the first entry in 'entries' for which 'predicate' returns true.
+// Returns true on match, and set 'match' to a copy of the matching entry. If
+// 'match' is null, it's ignored.
+bool FindBy(const std::vector<UDPEntry>& entries, UDPEntry* match,
+            std::function<bool(const UDPEntry&)> predicate) {
+  for (const UDPEntry& entry : entries) {
+    if (predicate(entry)) {
+      if (match != nullptr) {
+        *match = entry;
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+bool FindByLocalAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
+                     const struct sockaddr* addr) {
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
+  return FindBy(entries, match, [host, port](const UDPEntry& e) {
+    return (e.local_addr == host && e.local_port == port);
+  });
+}
+
+bool FindByRemoteAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
+                      const struct sockaddr* addr) {
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
+  return FindBy(entries, match, [host, port](const UDPEntry& e) {
+    return (e.remote_addr == host && e.remote_port == port);
+  });
+}
+
+PosixErrorOr<uint64_t> InodeFromSocketFD(int fd) {
+  ASSIGN_OR_RETURN_ERRNO(struct stat s, Fstat(fd));
+  if (!S_ISSOCK(s.st_mode)) {
+    return PosixError(EINVAL, StrFormat("FD %d is not a socket", fd));
+  }
+  return s.st_ino;
+}
+
+PosixErrorOr<bool> FindByFD(const std::vector<UDPEntry>& entries,
+                            UDPEntry* match, int fd) {
+  ASSIGN_OR_RETURN_ERRNO(uint64_t inode, InodeFromSocketFD(fd));
+  return FindBy(entries, match,
+                [inode](const UDPEntry& e) { return (e.inode == inode); });
+}
+
+// Returns a parsed representation of /proc/net/udp entries.
+PosixErrorOr<std::vector<UDPEntry>> ProcNetUDPEntries() {
+  std::string content;
+  RETURN_IF_ERRNO(GetContents("/proc/net/udp", &content));
+
+  bool found_header = false;
+  std::vector<UDPEntry> entries;
+  std::vector<std::string> lines = StrSplit(content, '\n');
+  std::cerr << "<contents of /proc/net/udp>" << std::endl;
+  for (const std::string& line : lines) {
+    std::cerr << line << std::endl;
+
+    if (!found_header) {
+      EXPECT_EQ(line, kProcNetUDPHeader);
+      found_header = true;
+      continue;
+    }
+    if (line.empty()) {
+      continue;
+    }
+
+    // Parse a single entry from /proc/net/udp.
+    //
+    // Example entries:
+    //
+    // clang-format off
+    //
+    //  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops
+    // 3503: 0100007F:0035 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 33317 2 0000000000000000 0
+    // 3518: 00000000:0044 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 40394 2 0000000000000000 0
+    //   ^     ^       ^     ^       ^   ^     ^       ^      ^     ^        ^         ^        ^   ^   ^      ^           ^
+    //   0     1       2     3       4   5     6       7      8     9       10        11       12  13  14     15           16
+    //
+    // clang-format on
+
+    UDPEntry entry;
+    std::vector<std::string> fields =
+        StrSplit(line, absl::ByAnyChar(": "), absl::SkipEmpty());
+
+    ASSIGN_OR_RETURN_ERRNO(entry.local_addr, AtoiBase(fields[1], 16));
+    ASSIGN_OR_RETURN_ERRNO(entry.local_port, AtoiBase(fields[2], 16));
+
+    ASSIGN_OR_RETURN_ERRNO(entry.remote_addr, AtoiBase(fields[3], 16));
+    ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
+
+    ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
+
+    // Linux shares internal data structures between TCP and UDP sockets. The
+    // proc entries for UDP sockets share some fields with TCP sockets, but
+    // these fields should always be zero as they're not meaningful for UDP
+    // sockets.
+    EXPECT_EQ(fields[8], "00") << StrFormat("sl:%s, tr", fields[0]);
+    EXPECT_EQ(fields[9], "00000000") << StrFormat("sl:%s, tm->when", fields[0]);
+    EXPECT_EQ(fields[10], "00000000")
+        << StrFormat("sl:%s, retrnsmt", fields[0]);
+    EXPECT_EQ(fields[12], "0") << StrFormat("sl:%s, timeout", fields[0]);
+
+    entries.push_back(entry);
+  }
+  std::cerr << "<end of /proc/net/udp>" << std::endl;
+
+  return entries;
+}
+
+TEST(ProcNetUDP, Exists) {
+  const std::string content =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/udp"));
+  const std::string header_line = StrCat(kProcNetUDPHeader, "\n");
+  EXPECT_THAT(content, ::testing::StartsWith(header_line));
+}
+
+TEST(ProcNetUDP, EntryUID) {
+  auto sockets =
+      ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+  std::vector<UDPEntry> entries =
+      ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+  UDPEntry e;
+  ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr()))
+      << DescribeFirstInetSocket(*sockets);
+  EXPECT_EQ(e.uid, geteuid());
+  ASSERT_TRUE(FindByRemoteAddr(entries, &e, sockets->first_addr()))
+      << DescribeSecondInetSocket(*sockets);
+  EXPECT_EQ(e.uid, geteuid());
+}
+
+TEST(ProcNetUDP, FindMutualEntries) {
+  auto sockets =
+      ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+  std::vector<UDPEntry> entries =
+      ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+
+  EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+      << DescribeFirstInetSocket(*sockets);
+  EXPECT_TRUE(FindByRemoteAddr(entries, nullptr, sockets->first_addr()))
+      << DescribeSecondInetSocket(*sockets);
+
+  EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+      << DescribeSecondInetSocket(*sockets);
+  EXPECT_TRUE(FindByRemoteAddr(entries, nullptr, sockets->second_addr()))
+      << DescribeFirstInetSocket(*sockets);
+}
+
+TEST(ProcNetUDP, EntriesRemovedOnClose) {
+  auto sockets =
+      ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+  std::vector<UDPEntry> entries =
+      ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+
+  EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+      << DescribeFirstInetSocket(*sockets);
+  EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+      << DescribeSecondInetSocket(*sockets);
+
+  EXPECT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+  entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+  // First socket's entry should be gone, but the second socket's entry should
+  // still exist.
+  EXPECT_FALSE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+      << DescribeFirstInetSocket(*sockets);
+  EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+      << DescribeSecondInetSocket(*sockets);
+
+  EXPECT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+  entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+  // Both entries should be gone.
+  EXPECT_FALSE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+      << DescribeFirstInetSocket(*sockets);
+  EXPECT_FALSE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+      << DescribeSecondInetSocket(*sockets);
+}
+
+PosixErrorOr<std::unique_ptr<FileDescriptor>> BoundUDPSocket() {
+  ASSIGN_OR_RETURN_ERRNO(std::unique_ptr<FileDescriptor> socket,
+                         IPv4UDPUnboundSocket(0).Create());
+  struct sockaddr_in addr;
+  addr.sin_family = AF_INET;
+  addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  addr.sin_port = 0;
+
+  int res = bind(socket->get(), reinterpret_cast<const struct sockaddr*>(&addr),
+                 sizeof(addr));
+  if (res) {
+    return PosixError(errno, "bind()");
+  }
+  return socket;
+}
+
+TEST(ProcNetUDP, BoundEntry) {
+  std::unique_ptr<FileDescriptor> socket =
+      ASSERT_NO_ERRNO_AND_VALUE(BoundUDPSocket());
+  struct sockaddr addr;
+  socklen_t len = sizeof(addr);
+  ASSERT_THAT(getsockname(socket->get(), &addr, &len), SyscallSucceeds());
+  uint16_t port = PortFromInetSockaddr(&addr);
+
+  std::vector<UDPEntry> entries =
+      ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+  UDPEntry e;
+  ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(FindByFD(entries, &e, socket->get())));
+  EXPECT_EQ(e.local_port, port);
+  EXPECT_EQ(e.remote_addr, 0);
+  EXPECT_EQ(e.remote_port, 0);
+}
+
+TEST(ProcNetUDP, BoundSocketStateClosed) {
+  std::unique_ptr<FileDescriptor> socket =
+      ASSERT_NO_ERRNO_AND_VALUE(BoundUDPSocket());
+  std::vector<UDPEntry> entries =
+      ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+  UDPEntry e;
+  ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(FindByFD(entries, &e, socket->get())));
+  EXPECT_EQ(e.state, TCP_CLOSE);
+}
+
+TEST(ProcNetUDP, ConnectedSocketStateEstablished) {
+  auto sockets =
+      ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+  std::vector<UDPEntry> entries =
+      ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+
+  UDPEntry e;
+  ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr()))
+      << DescribeFirstInetSocket(*sockets);
+  EXPECT_EQ(e.state, TCP_ESTABLISHED);
+
+  ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->second_addr()))
+      << DescribeSecondInetSocket(*sockets);
+  EXPECT_EQ(e.state, TCP_ESTABLISHED);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index ae49725a0..f7d231b14 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -105,6 +105,15 @@ PosixErrorOr<struct stat> Stat(absl::string_view path) {
   return stat_buf;
 }
 
+PosixErrorOr<struct stat> Fstat(int fd) {
+  struct stat stat_buf;
+  int res = fstat(fd, &stat_buf);
+  if (res < 0) {
+    return PosixError(errno, absl::StrCat("fstat ", fd));
+  }
+  return stat_buf;
+}
+
 PosixErrorOr<bool> Exists(absl::string_view path) {
   struct stat stat_buf;
   int res = stat(std::string(path).c_str(), &stat_buf);
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index 3969f8309..e5b555891 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -35,6 +35,9 @@ PosixErrorOr<bool> Exists(absl::string_view path);
 // Returns a stat structure for the given path or an error.
 PosixErrorOr<struct stat> Stat(absl::string_view path);
 
+// Returns a stat struct for the given fd.
+PosixErrorOr<struct stat> Fstat(int fd);
+
 // Deletes the file or directory at path or returns an error.
 PosixError Delete(absl::string_view path);
 
-- 
cgit v1.2.3


From 502c47f7a70a088213faf27b60e6f62ece4dd765 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 30 Aug 2019 17:17:45 -0700
Subject: Return correct buffer size for ioctl(socket, FIONREAD)

Ioctl was returning just the buffer size from epsocket.endpoint
and it was not considering data from epsocket.SocketOperations
that was read from the endpoint, but not yet sent to the caller.

PiperOrigin-RevId: 266485461
---
 pkg/sentry/socket/epsocket/epsocket.go | 22 +++++++++++++++++++++-
 test/syscalls/linux/tcp_socket.cc      | 21 ++++++++++++++++++++-
 2 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index def29646e..0e37ce61b 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -2104,7 +2104,8 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 	// SIOCGSTAMP is implemented by epsocket rather than all commonEndpoint
 	// sockets.
 	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
-	if int(args[1].Int()) == syscall.SIOCGSTAMP {
+	switch args[1].Int() {
+	case syscall.SIOCGSTAMP:
 		s.readMu.Lock()
 		defer s.readMu.Unlock()
 		if !s.timestampValid {
@@ -2116,6 +2117,25 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 			AddressSpaceActive: true,
 		})
 		return 0, err
+
+	case linux.TIOCINQ:
+		v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
+		if terr != nil {
+			return 0, syserr.TranslateNetstackError(terr).ToError()
+		}
+
+		// Add bytes removed from the endpoint but not yet sent to the caller.
+		v += len(s.readView)
+
+		if v > math.MaxInt32 {
+			v = math.MaxInt32
+		}
+
+		// Copy result to user-space.
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
 	}
 
 	return Ioctl(ctx, s.Endpoint, io, args)
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 8f4d3f386..bfa031bce 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -579,7 +579,7 @@ TEST_P(TcpSocketTest, TcpInq) {
     if (size == sizeof(buf)) {
       break;
     }
-    usleep(10000);
+    absl::SleepFor(absl::Milliseconds(10));
   }
 
   struct msghdr msg = {};
@@ -610,6 +610,25 @@ TEST_P(TcpSocketTest, TcpInq) {
   }
 }
 
+TEST_P(TcpSocketTest, Tiocinq) {
+  char buf[1024];
+  size_t size = sizeof(buf);
+  ASSERT_THAT(RetryEINTR(write)(s_, buf, size), SyscallSucceedsWithValue(size));
+
+  uint32_t seed = time(nullptr);
+  const size_t max_chunk = size / 10;
+  while (size > 0) {
+    size_t chunk = (rand_r(&seed) % max_chunk) + 1;
+    ssize_t read = RetryEINTR(recvfrom)(t_, buf, chunk, 0, nullptr, nullptr);
+    ASSERT_THAT(read, SyscallSucceeds());
+    size -= read;
+
+    int inq = 0;
+    ASSERT_THAT(ioctl(t_, TIOCINQ, &inq), SyscallSucceeds());
+    ASSERT_EQ(inq, size);
+  }
+}
+
 TEST_P(TcpSocketTest, TcpSCMPriority) {
   char buf[1024];
   ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)),
-- 
cgit v1.2.3


From 54bf2e8eff4a5e619e7e3abafcda6ffc52d937f2 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Fri, 30 Aug 2019 18:09:37 -0700
Subject: Automated rollback of changelist 261387276

PiperOrigin-RevId: 266491264
---
 pkg/sentry/fs/tty/BUILD           |   1 -
 pkg/sentry/fs/tty/dir.go          |   3 -
 pkg/sentry/fs/tty/master.go       |  17 +-
 pkg/sentry/fs/tty/slave.go        |  13 +-
 pkg/sentry/fs/tty/terminal.go     |  92 +---------
 pkg/sentry/kernel/BUILD           |   1 -
 pkg/sentry/kernel/sessions.go     |  12 +-
 pkg/sentry/kernel/task_start.go   |   3 +-
 pkg/sentry/kernel/thread_group.go | 179 -------------------
 pkg/sentry/kernel/tty.go          |  28 ---
 test/syscalls/BUILD               |   4 -
 test/syscalls/linux/BUILD         |  19 ---
 test/syscalls/linux/pty.cc        | 351 +++-----------------------------------
 test/syscalls/linux/pty_root.cc   |  68 --------
 test/util/BUILD                   |  11 --
 test/util/pty_util.cc             |  45 -----
 test/util/pty_util.h              |  30 ----
 17 files changed, 34 insertions(+), 843 deletions(-)
 delete mode 100644 pkg/sentry/kernel/tty.go
 delete mode 100644 test/syscalls/linux/pty_root.cc
 delete mode 100644 test/util/pty_util.cc
 delete mode 100644 test/util/pty_util.h

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 291164986..5e9327aec 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -23,7 +23,6 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
-        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 2f639c823..1d128532b 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -129,9 +129,6 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
 
 // Release implements fs.InodeOperations.Release.
 func (d *dirInodeOperations) Release(ctx context.Context) {
-	d.mu.Lock()
-	defer d.mu.Unlock()
-
 	d.master.DecRef()
 	if len(d.slaves) != 0 {
 		panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 19b7557d5..92ec1ca18 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -172,19 +172,6 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io userme
 		return 0, mf.t.ld.windowSize(ctx, io, args)
 	case linux.TIOCSWINSZ:
 		return 0, mf.t.ld.setWindowSize(ctx, io, args)
-	case linux.TIOCSCTTY:
-		// Make the given terminal the controlling terminal of the
-		// calling process.
-		return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
-	case linux.TIOCNOTTY:
-		// Release this process's controlling terminal.
-		return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
-	case linux.TIOCGPGRP:
-		// Get the foreground process group.
-		return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
-	case linux.TIOCSPGRP:
-		// Set the foreground process group.
-		return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
@@ -198,6 +185,8 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		linux.TCSETS,
 		linux.TCSETSW,
 		linux.TCSETSF,
+		linux.TIOCGPGRP,
+		linux.TIOCSPGRP,
 		linux.TIOCGWINSZ,
 		linux.TIOCSWINSZ,
 		linux.TIOCSETD,
@@ -211,6 +200,8 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		linux.TIOCEXCL,
 		linux.TIOCNXCL,
 		linux.TIOCGEXCL,
+		linux.TIOCNOTTY,
+		linux.TIOCSCTTY,
 		linux.TIOCGSID,
 		linux.TIOCGETD,
 		linux.TIOCVHANGUP,
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 944c4ada1..e30266404 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -152,16 +152,9 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
-	case linux.TIOCNOTTY:
-		// Release this process's controlling terminal.
-		return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
-	case linux.TIOCGPGRP:
-		// Get the foreground process group.
-		return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
-	case linux.TIOCSPGRP:
-		// Set the foreground process group.
-		return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+		// TODO(b/129283598): Implement once we have support for job
+		// control.
+		return 0, nil
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ff8138820..b7cecb2ed 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,10 +17,7 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
 // Terminal is a pseudoterminal.
@@ -29,100 +26,23 @@ import (
 type Terminal struct {
 	refs.AtomicRefCount
 
-	// n is the terminal index. It is immutable.
+	// n is the terminal index.
 	n uint32
 
-	// d is the containing directory. It is immutable.
+	// d is the containing directory.
 	d *dirInodeOperations
 
-	// ld is the line discipline of the terminal. It is immutable.
+	// ld is the line discipline of the terminal.
 	ld *lineDiscipline
-
-	// masterKTTY contains the controlling process of the master end of
-	// this terminal. This field is immutable.
-	masterKTTY *kernel.TTY
-
-	// slaveKTTY contains the controlling process of the slave end of this
-	// terminal. This field is immutable.
-	slaveKTTY *kernel.TTY
 }
 
 func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
 	termios := linux.DefaultSlaveTermios
 	t := Terminal{
-		d:          d,
-		n:          n,
-		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{},
-		slaveKTTY:  &kernel.TTY{},
+		d:  d,
+		n:  n,
+		ld: newLineDiscipline(termios),
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
 }
-
-// setControllingTTY makes tm the controlling terminal of the calling thread
-// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
-	task := kernel.TaskFromContext(ctx)
-	if task == nil {
-		panic("setControllingTTY must be called from a task context")
-	}
-
-	return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int())
-}
-
-// releaseControllingTTY removes tm as the controlling terminal of the calling
-// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
-	task := kernel.TaskFromContext(ctx)
-	if task == nil {
-		panic("releaseControllingTTY must be called from a task context")
-	}
-
-	return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
-}
-
-// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
-	task := kernel.TaskFromContext(ctx)
-	if task == nil {
-		panic("foregroundProcessGroup must be called from a task context")
-	}
-
-	ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
-	if err != nil {
-		return 0, err
-	}
-
-	// Write it out to *arg.
-	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-	return 0, err
-}
-
-// foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
-	task := kernel.TaskFromContext(ctx)
-	if task == nil {
-		panic("setForegroundProcessGroup must be called from a task context")
-	}
-
-	// Read in the process group ID.
-	var pgid int32
-	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
-		AddressSpaceActive: true,
-	}); err != nil {
-		return 0, err
-	}
-
-	ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
-	return uintptr(ret), err
-}
-
-func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
-	if isMaster {
-		return tm.masterKTTY
-	}
-	return tm.slaveKTTY
-}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 41bee9a22..e61d39c82 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -144,7 +144,6 @@ go_library(
         "threads.go",
         "timekeeper.go",
         "timekeeper_state.go",
-        "tty.go",
         "uts_namespace.go",
         "vdso.go",
         "version.go",
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index e5f297478..81fcd8258 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -47,11 +47,6 @@ type Session struct {
 	// The id is immutable.
 	id SessionID
 
-	// foreground is the foreground process group.
-	//
-	// This is protected by TaskSet.mu.
-	foreground *ProcessGroup
-
 	// ProcessGroups is a list of process groups in this Session. This is
 	// protected by TaskSet.mu.
 	processGroups processGroupList
@@ -265,14 +260,12 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
 func (tg *ThreadGroup) CreateSession() error {
 	tg.pidns.owner.mu.Lock()
 	defer tg.pidns.owner.mu.Unlock()
-	tg.signalHandlers.mu.Lock()
-	defer tg.signalHandlers.mu.Unlock()
 	return tg.createSession()
 }
 
 // createSession creates a new session for a threadgroup.
 //
-// Precondition: callers must hold TaskSet.mu and the signal mutex for writing.
+// Precondition: callers must hold TaskSet.mu for writing.
 func (tg *ThreadGroup) createSession() error {
 	// Get the ID for this thread in the current namespace.
 	id := tg.pidns.tgids[tg]
@@ -353,9 +346,6 @@ func (tg *ThreadGroup) createSession() error {
 		ns.processGroups[ProcessGroupID(local)] = pg
 	}
 
-	// Disconnect from the controlling terminal.
-	tg.tty = nil
-
 	return nil
 }
 
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index ae6fc4025..d60cd62c7 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -172,10 +172,9 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		if parentPG := tg.parentPG(); parentPG == nil {
 			tg.createSession()
 		} else {
-			// Inherit the process group and terminal.
+			// Inherit the process group.
 			parentPG.incRefWithParent(parentPG)
 			tg.processGroup = parentPG
-			tg.tty = t.parent.tg.tty
 		}
 	}
 	tg.tasks.PushBack(t)
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 0eef24bfb..2a97e3e8e 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,13 +19,10 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // A ThreadGroup is a logical grouping of tasks that has widespread
@@ -248,12 +245,6 @@ type ThreadGroup struct {
 	//
 	// mounts is immutable.
 	mounts *fs.MountNamespace
-
-	// tty is the thread group's controlling terminal. If nil, there is no
-	// controlling terminal.
-	//
-	// tty is protected by the signal mutex.
-	tty *TTY
 }
 
 // newThreadGroup returns a new, empty thread group in PID namespace ns. The
@@ -333,176 +324,6 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
 	}
 }
 
-// SetControllingTTY sets tty as the controlling terminal of tg.
-func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
-	tty.mu.Lock()
-	defer tty.mu.Unlock()
-
-	// We might be asked to set the controlling terminal of multiple
-	// processes, so we lock both the TaskSet and SignalHandlers.
-	tg.pidns.owner.mu.Lock()
-	defer tg.pidns.owner.mu.Unlock()
-	tg.signalHandlers.mu.Lock()
-	defer tg.signalHandlers.mu.Unlock()
-
-	// "The calling process must be a session leader and not have a
-	// controlling terminal already." - tty_ioctl(4)
-	if tg.processGroup.session.leader != tg || tg.tty != nil {
-		return syserror.EINVAL
-	}
-
-	// "If this terminal is already the controlling terminal of a different
-	// session group, then the ioctl fails with EPERM, unless the caller
-	// has the CAP_SYS_ADMIN capability and arg equals 1, in which case the
-	// terminal is stolen, and all processes that had it as controlling
-	// terminal lose it." - tty_ioctl(4)
-	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
-		if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
-			return syserror.EPERM
-		}
-		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
-		for othertg := range tg.pidns.owner.Root.tgids {
-			// This won't deadlock by locking tg.signalHandlers
-			// because at this point:
-			// - We only lock signalHandlers if it's in the same
-			//   session as the tty's controlling thread group.
-			// - We know that the calling thread group is not in
-			//   the same session as the tty's controlling thread
-			//   group.
-			if othertg.processGroup.session == tty.tg.processGroup.session {
-				othertg.signalHandlers.mu.Lock()
-				othertg.tty = nil
-				othertg.signalHandlers.mu.Unlock()
-			}
-		}
-	}
-
-	// Set the controlling terminal and foreground process group.
-	tg.tty = tty
-	tg.processGroup.session.foreground = tg.processGroup
-	// Set this as the controlling process of the terminal.
-	tty.tg = tg
-
-	return nil
-}
-
-// ReleaseControllingTTY gives up tty as the controlling tty of tg.
-func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
-	tty.mu.Lock()
-	defer tty.mu.Unlock()
-
-	// We might be asked to set the controlling terminal of multiple
-	// processes, so we lock both the TaskSet and SignalHandlers.
-	tg.pidns.owner.mu.RLock()
-	defer tg.pidns.owner.mu.RUnlock()
-
-	// Just below, we may re-lock signalHandlers in order to send signals.
-	// Thus we can't defer Unlock here.
-	tg.signalHandlers.mu.Lock()
-
-	if tg.tty == nil || tg.tty != tty {
-		tg.signalHandlers.mu.Unlock()
-		return syserror.ENOTTY
-	}
-
-	// "If the process was session leader, then send SIGHUP and SIGCONT to
-	// the foreground process group and all processes in the current
-	// session lose their controlling terminal." - tty_ioctl(4)
-	// Remove tty as the controlling tty for each process in the session,
-	// then send them SIGHUP and SIGCONT.
-
-	// If we're not the session leader, we don't have to do much.
-	if tty.tg != tg {
-		tg.tty = nil
-		tg.signalHandlers.mu.Unlock()
-		return nil
-	}
-
-	tg.signalHandlers.mu.Unlock()
-
-	// We're the session leader. SIGHUP and SIGCONT the foreground process
-	// group and remove all controlling terminals in the session.
-	var lastErr error
-	for othertg := range tg.pidns.owner.Root.tgids {
-		if othertg.processGroup.session == tg.processGroup.session {
-			othertg.signalHandlers.mu.Lock()
-			othertg.tty = nil
-			if othertg.processGroup == tg.processGroup.session.foreground {
-				if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil {
-					lastErr = err
-				}
-				if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil {
-					lastErr = err
-				}
-			}
-			othertg.signalHandlers.mu.Unlock()
-		}
-	}
-
-	return lastErr
-}
-
-// ForegroundProcessGroup returns the process group ID of the foreground
-// process group.
-func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) {
-	tty.mu.Lock()
-	defer tty.mu.Unlock()
-
-	tg.pidns.owner.mu.Lock()
-	defer tg.pidns.owner.mu.Unlock()
-	tg.signalHandlers.mu.Lock()
-	defer tg.signalHandlers.mu.Unlock()
-
-	// "When fd does not refer to the controlling terminal of the calling
-	// process, -1 is returned" - tcgetpgrp(3)
-	if tg.tty != tty {
-		return -1, syserror.ENOTTY
-	}
-
-	return int32(tg.processGroup.session.foreground.id), nil
-}
-
-// SetForegroundProcessGroup sets the foreground process group of tty to pgid.
-func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) {
-	tty.mu.Lock()
-	defer tty.mu.Unlock()
-
-	tg.pidns.owner.mu.Lock()
-	defer tg.pidns.owner.mu.Unlock()
-	tg.signalHandlers.mu.Lock()
-	defer tg.signalHandlers.mu.Unlock()
-
-	// TODO(b/129283598): "If tcsetpgrp() is called by a member of a
-	// background process group in its session, and the calling process is
-	// not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all
-	// members of this background process group."
-
-	// tty must be the controlling terminal.
-	if tg.tty != tty {
-		return -1, syserror.ENOTTY
-	}
-
-	// pgid must be positive.
-	if pgid < 0 {
-		return -1, syserror.EINVAL
-	}
-
-	// pg must not be empty. Empty process groups are removed from their
-	// pid namespaces.
-	pg, ok := tg.pidns.processGroups[pgid]
-	if !ok {
-		return -1, syserror.ESRCH
-	}
-
-	// pg must be part of this process's session.
-	if tg.processGroup.session != pg.session {
-		return -1, syserror.EPERM
-	}
-
-	tg.processGroup.session.foreground.id = pgid
-	return 0, nil
-}
-
 // itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
 //
 // +stateify savable
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
deleted file mode 100644
index 34f84487a..000000000
--- a/pkg/sentry/kernel/tty.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import "sync"
-
-// TTY defines the relationship between a thread group and its controlling
-// terminal.
-//
-// +stateify savable
-type TTY struct {
-	mu sync.Mutex `state:"nosave"`
-
-	// tg is protected by mu.
-	tg *ThreadGroup
-}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 6947ddc25..a8a2e75d3 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -318,10 +318,6 @@ syscall_test(
     test = "//test/syscalls/linux:pty_test",
 )
 
-syscall_test(
-    test = "//test/syscalls/linux:pty_root_test",
-)
-
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:pwritev2_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index bb065aa4f..aeb4b405d 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1278,10 +1278,8 @@ cc_binary(
     srcs = ["pty.cc"],
     linkstatic = 1,
     deps = [
-        "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
-        "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
@@ -1293,23 +1291,6 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "pty_root_test",
-    testonly = 1,
-    srcs = ["pty_root.cc"],
-    linkstatic = 1,
-    deps = [
-        "//test/util:capability_util",
-        "//test/util:file_descriptor",
-        "//test/util:posix_error",
-        "//test/util:pty_util",
-        "//test/util:test_main",
-        "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
 cc_binary(
     name = "partial_bad_buffer_test",
     testonly = 1,
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index c605b6549..d1ab4703f 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -13,17 +13,13 @@
 // limitations under the License.
 
 #include <fcntl.h>
-#include <linux/capability.h>
 #include <linux/major.h>
 #include <poll.h>
-#include <sched.h>
-#include <signal.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
-#include <sys/wait.h>
 #include <termios.h>
 #include <unistd.h>
 
@@ -35,10 +31,8 @@
 #include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
-#include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
-#include "test/util/pty_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
@@ -376,6 +370,25 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
   return PosixError(ETIMEDOUT, "Poll timed out");
 }
 
+// Opens the slave end of the passed master as R/W and nonblocking.
+PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
+  // Get pty index.
+  int n;
+  int ret = ioctl(master.get(), TIOCGPTN, &n);
+  if (ret < 0) {
+    return PosixError(errno, "ioctl(TIOCGPTN) failed");
+  }
+
+  // Unlock pts.
+  int unlock = 0;
+  ret = ioctl(master.get(), TIOCSPTLCK, &unlock);
+  if (ret < 0) {
+    return PosixError(errno, "ioctl(TIOSPTLCK) failed");
+  }
+
+  return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK);
+}
+
 TEST(BasicPtyTest, StatUnopenedMaster) {
   struct stat s;
   ASSERT_THAT(stat("/dev/ptmx", &s), SyscallSucceeds());
@@ -1220,332 +1233,6 @@ TEST_F(PtyTest, SetMasterWindowSize) {
   EXPECT_EQ(retrieved_ws.ws_col, kCols);
 }
 
-class JobControlTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-    slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_));
-
-    // Make this a session leader, which also drops the controlling terminal.
-    // In the gVisor test environment, this test will be run as the session
-    // leader already (as the sentry init process).
-    if (!IsRunningOnGvisor()) {
-      ASSERT_THAT(setsid(), SyscallSucceeds());
-    }
-  }
-
-  // Master and slave ends of the PTY. Non-blocking.
-  FileDescriptor master_;
-  FileDescriptor slave_;
-};
-
-TEST_F(JobControlTest, SetTTYMaster) {
-  ASSERT_THAT(ioctl(master_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-}
-
-TEST_F(JobControlTest, SetTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-}
-
-TEST_F(JobControlTest, SetTTYNonLeader) {
-  // Fork a process that won't be the session leader.
-  pid_t child = fork();
-  if (!child) {
-    // We shouldn't be able to set the terminal.
-    TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 0));
-    _exit(0);
-  }
-
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
-}
-
-TEST_F(JobControlTest, SetTTYBadArg) {
-  // Despite the man page saying arg should be 0 here, Linux doesn't actually
-  // check.
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 1), SyscallSucceeds());
-}
-
-TEST_F(JobControlTest, SetTTYDifferentSession) {
-  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
-
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Fork, join a new session, and try to steal the parent's controlling
-  // terminal, which should fail.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(setsid() >= 0);
-    // We shouldn't be able to steal the terminal.
-    TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 1));
-    _exit(0);
-  }
-
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
-}
-
-TEST_F(JobControlTest, ReleaseTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Make sure we're ignoring SIGHUP, which will be sent to this process once we
-  // disconnect they TTY.
-  struct sigaction sa = {};
-  sa.sa_handler = SIG_IGN;
-  sigemptyset(&sa.sa_mask);
-  struct sigaction old_sa;
-  EXPECT_THAT(sigaction(SIGHUP, &sa, &old_sa), SyscallSucceeds());
-  EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
-  EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
-}
-
-TEST_F(JobControlTest, ReleaseUnsetTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
-}
-
-TEST_F(JobControlTest, ReleaseWrongTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  ASSERT_THAT(ioctl(master_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
-}
-
-TEST_F(JobControlTest, ReleaseTTYNonLeader) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(!ioctl(slave_.get(), TIOCNOTTY));
-    _exit(0);
-  }
-
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
-}
-
-TEST_F(JobControlTest, ReleaseTTYDifferentSession) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  pid_t child = fork();
-  if (!child) {
-    // Join a new session, then try to disconnect.
-    TEST_PCHECK(setsid() >= 0);
-    TEST_PCHECK(ioctl(slave_.get(), TIOCNOTTY));
-    _exit(0);
-  }
-
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
-}
-
-// Used by the child process spawned in ReleaseTTYSignals to track received
-// signals.
-static int received;
-
-void sig_handler(int signum) { received |= signum; }
-
-// When the session leader releases its controlling terminal, the foreground
-// process group gets SIGHUP, then SIGCONT. This test:
-// - Spawns 2 threads
-// - Has thread 1 return 0 if it gets both SIGHUP and SIGCONT
-// - Has thread 2 leave the foreground process group, and return non-zero if it
-//   receives any signals.
-// - Has the parent thread release its controlling terminal
-// - Checks that thread 1 got both signals
-// - Checks that thread 2 didn't get any signals.
-TEST_F(JobControlTest, ReleaseTTYSignals) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  received = 0;
-  struct sigaction sa = {};
-  sa.sa_handler = sig_handler;
-  sigemptyset(&sa.sa_mask);
-  sigaddset(&sa.sa_mask, SIGHUP);
-  sigaddset(&sa.sa_mask, SIGCONT);
-  sigprocmask(SIG_BLOCK, &sa.sa_mask, NULL);
-
-  pid_t same_pgrp_child = fork();
-  if (!same_pgrp_child) {
-    // The child will wait for SIGHUP and SIGCONT, then return 0. It begins with
-    // SIGHUP and SIGCONT blocked. We install signal handlers for those signals,
-    // then use sigsuspend to wait for those specific signals.
-    TEST_PCHECK(!sigaction(SIGHUP, &sa, NULL));
-    TEST_PCHECK(!sigaction(SIGCONT, &sa, NULL));
-    sigset_t mask;
-    sigfillset(&mask);
-    sigdelset(&mask, SIGHUP);
-    sigdelset(&mask, SIGCONT);
-    while (received != (SIGHUP | SIGCONT)) {
-      sigsuspend(&mask);
-    }
-    _exit(0);
-  }
-
-  // We don't want to block these anymore.
-  sigprocmask(SIG_UNBLOCK, &sa.sa_mask, NULL);
-
-  // This child will return non-zero if either SIGHUP or SIGCONT are received.
-  pid_t diff_pgrp_child = fork();
-  if (!diff_pgrp_child) {
-    TEST_PCHECK(!setpgid(0, 0));
-    TEST_PCHECK(pause());
-    _exit(1);
-  }
-
-  EXPECT_THAT(setpgid(diff_pgrp_child, diff_pgrp_child), SyscallSucceeds());
-
-  // Make sure we're ignoring SIGHUP, which will be sent to this process once we
-  // disconnect they TTY.
-  struct sigaction sighup_sa = {};
-  sighup_sa.sa_handler = SIG_IGN;
-  sigemptyset(&sighup_sa.sa_mask);
-  struct sigaction old_sa;
-  EXPECT_THAT(sigaction(SIGHUP, &sighup_sa, &old_sa), SyscallSucceeds());
-
-  // Release the controlling terminal, sending SIGHUP and SIGCONT to all other
-  // processes in this process group.
-  EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
-
-  EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
-
-  // The child in the same process group will get signaled.
-  int wstatus;
-  EXPECT_THAT(waitpid(same_pgrp_child, &wstatus, 0),
-              SyscallSucceedsWithValue(same_pgrp_child));
-  EXPECT_EQ(wstatus, 0);
-
-  // The other child will not get signaled.
-  EXPECT_THAT(waitpid(diff_pgrp_child, &wstatus, WNOHANG),
-              SyscallSucceedsWithValue(0));
-  EXPECT_THAT(kill(diff_pgrp_child, SIGKILL), SyscallSucceeds());
-}
-
-TEST_F(JobControlTest, GetForegroundProcessGroup) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-  pid_t foreground_pgid;
-  pid_t pid;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
-              SyscallSucceeds());
-  ASSERT_THAT(pid = getpid(), SyscallSucceeds());
-
-  ASSERT_EQ(foreground_pgid, pid);
-}
-
-TEST_F(JobControlTest, GetForegroundProcessGroupNonControlling) {
-  // At this point there's no controlling terminal, so TIOCGPGRP should fail.
-  pid_t foreground_pgid;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
-              SyscallFailsWithErrno(ENOTTY));
-}
-
-// This test:
-// - sets itself as the foreground process group
-// - creates a child process in a new process group
-// - sets that child as the foreground process group
-// - kills its child and sets itself as the foreground process group.
-TEST_F(JobControlTest, SetForegroundProcessGroup) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Ignore SIGTTOU so that we don't stop ourself when calling tcsetpgrp.
-  struct sigaction sa = {};
-  sa.sa_handler = SIG_IGN;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGTTOU, &sa, NULL);
-
-  // Set ourself as the foreground process group.
-  ASSERT_THAT(tcsetpgrp(slave_.get(), getpgid(0)), SyscallSucceeds());
-
-  // Create a new process that just waits to be signaled.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(!pause());
-    // We should never reach this.
-    _exit(1);
-  }
-
-  // Make the child its own process group, then make it the controlling process
-  // group of the terminal.
-  ASSERT_THAT(setpgid(child, child), SyscallSucceeds());
-  ASSERT_THAT(tcsetpgrp(slave_.get(), child), SyscallSucceeds());
-
-  // Sanity check - we're still the controlling session.
-  ASSERT_EQ(getsid(0), getsid(child));
-
-  // Signal the child, wait for it to exit, then retake the terminal.
-  ASSERT_THAT(kill(child, SIGTERM), SyscallSucceeds());
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_TRUE(WIFSIGNALED(wstatus));
-  ASSERT_EQ(WTERMSIG(wstatus), SIGTERM);
-
-  // Set ourself as the foreground process.
-  pid_t pgid;
-  ASSERT_THAT(pgid = getpgid(0), SyscallSucceeds());
-  ASSERT_THAT(tcsetpgrp(slave_.get(), pgid), SyscallSucceeds());
-}
-
-TEST_F(JobControlTest, SetForegroundProcessGroupWrongTTY) {
-  pid_t pid = getpid();
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
-              SyscallFailsWithErrno(ENOTTY));
-}
-
-TEST_F(JobControlTest, SetForegroundProcessGroupNegPgid) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  pid_t pid = -1;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
-              SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_F(JobControlTest, SetForegroundProcessGroupEmptyProcessGroup) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Create a new process, put it in a new process group, make that group the
-  // foreground process group, then have the process wait.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(!setpgid(0, 0));
-    _exit(0);
-  }
-
-  // Wait for the child to exit.
-  int wstatus;
-  EXPECT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  // The child's process group doesn't exist anymore - this should fail.
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
-              SyscallFailsWithErrno(ESRCH));
-}
-
-TEST_F(JobControlTest, SetForegroundProcessGroupDifferentSession) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Create a new process and put it in a new session.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(setsid() >= 0);
-    // Tell the parent we're in a new session.
-    TEST_PCHECK(!raise(SIGSTOP));
-    TEST_PCHECK(!pause());
-    _exit(1);
-  }
-
-  // Wait for the child to tell us it's in a new session.
-  int wstatus;
-  EXPECT_THAT(waitpid(child, &wstatus, WUNTRACED),
-              SyscallSucceedsWithValue(child));
-  EXPECT_TRUE(WSTOPSIG(wstatus));
-
-  // Child is in a new session, so we can't make it the foregroup process group.
-  EXPECT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
-              SyscallFailsWithErrno(EPERM));
-
-  EXPECT_THAT(kill(child, SIGKILL), SyscallSucceeds());
-}
-
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/pty_root.cc b/test/syscalls/linux/pty_root.cc
deleted file mode 100644
index d2a321a6e..000000000
--- a/test/syscalls/linux/pty_root.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <sys/ioctl.h>
-#include <termios.h>
-
-#include "gtest/gtest.h"
-#include "absl/base/macros.h"
-#include "test/util/capability_util.h"
-#include "test/util/file_descriptor.h"
-#include "test/util/posix_error.h"
-#include "test/util/pty_util.h"
-
-namespace gvisor {
-namespace testing {
-
-// These tests should be run as root.
-namespace {
-
-TEST(JobControlRootTest, StealTTY) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
-
-  // Make this a session leader, which also drops the controlling terminal.
-  // In the gVisor test environment, this test will be run as the session
-  // leader already (as the sentry init process).
-  if (!IsRunningOnGvisor()) {
-    ASSERT_THAT(setsid(), SyscallSucceeds());
-  }
-
-  FileDescriptor master =
-      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-  FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
-
-  // Make slave the controlling terminal.
-  ASSERT_THAT(ioctl(slave.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Fork, join a new session, and try to steal the parent's controlling
-  // terminal, which should succeed when we have CAP_SYS_ADMIN and pass an arg
-  // of 1.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(setsid() >= 0);
-    // We shouldn't be able to steal the terminal with the wrong arg value.
-    TEST_PCHECK(ioctl(slave.get(), TIOCSCTTY, 0));
-    // We should be able to steal it here.
-    TEST_PCHECK(!ioctl(slave.get(), TIOCSCTTY, 1));
-    _exit(0);
-  }
-
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
-}
-
-}  // namespace
-}  // namespace testing
-}  // namespace gvisor
diff --git a/test/util/BUILD b/test/util/BUILD
index cfea029b2..8afd89d8d 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -189,17 +189,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "pty_util",
-    testonly = 1,
-    srcs = ["pty_util.cc"],
-    hdrs = ["pty_util.h"],
-    deps = [
-        ":file_descriptor",
-        ":posix_error",
-    ],
-)
-
 cc_library(
     name = "signal_util",
     testonly = 1,
diff --git a/test/util/pty_util.cc b/test/util/pty_util.cc
deleted file mode 100644
index c0fd9a095..000000000
--- a/test/util/pty_util.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "test/util/pty_util.h"
-
-#include <sys/ioctl.h>
-#include <termios.h>
-
-#include "test/util/file_descriptor.h"
-#include "test/util/posix_error.h"
-
-namespace gvisor {
-namespace testing {
-
-PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
-  // Get pty index.
-  int n;
-  int ret = ioctl(master.get(), TIOCGPTN, &n);
-  if (ret < 0) {
-    return PosixError(errno, "ioctl(TIOCGPTN) failed");
-  }
-
-  // Unlock pts.
-  int unlock = 0;
-  ret = ioctl(master.get(), TIOCSPTLCK, &unlock);
-  if (ret < 0) {
-    return PosixError(errno, "ioctl(TIOSPTLCK) failed");
-  }
-
-  return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK);
-}
-
-}  // namespace testing
-}  // namespace gvisor
diff --git a/test/util/pty_util.h b/test/util/pty_util.h
deleted file mode 100644
index 367b14f15..000000000
--- a/test/util/pty_util.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GVISOR_TEST_UTIL_PTY_UTIL_H_
-#define GVISOR_TEST_UTIL_PTY_UTIL_H_
-
-#include "test/util/file_descriptor.h"
-#include "test/util/posix_error.h"
-
-namespace gvisor {
-namespace testing {
-
-// Opens the slave end of the passed master as R/W and nonblocking.
-PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master);
-
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // GVISOR_TEST_UTIL_PTY_UTIL_H_
-- 
cgit v1.2.3


From 0352cf5866ddb5eea24fa35c69e2e43038cfb60a Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 30 Aug 2019 19:05:30 -0700
Subject: Remove support for non-incremental mapped accounting.

PiperOrigin-RevId: 266496644
---
 pkg/sentry/fs/fsutil/inode_cached.go | 18 ++----------------
 pkg/sentry/usage/memory.go           |  5 -----
 2 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 20cb9a367..d404a79d4 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -796,11 +796,6 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi
 			mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End})
 		}
 	}
-	if c.useHostPageCache() && !usage.IncrementalMappedAccounting {
-		for _, r := range mapped {
-			usage.MemoryAccounting.Inc(r.Length(), usage.Mapped)
-		}
-	}
 	c.mapsMu.Unlock()
 	return nil
 }
@@ -814,11 +809,6 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma
 		c.hostFileMapper.DecRefOn(r)
 	}
 	if c.useHostPageCache() {
-		if !usage.IncrementalMappedAccounting {
-			for _, r := range unmapped {
-				usage.MemoryAccounting.Dec(r.Length(), usage.Mapped)
-			}
-		}
 		c.mapsMu.Unlock()
 		return
 	}
@@ -1001,9 +991,7 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
 			seg, gap = seg.NextNonEmpty()
 		case gap.Ok() && gap.Start() < fr.End:
 			newRange := gap.Range().Intersect(fr)
-			if usage.IncrementalMappedAccounting {
-				usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
-			}
+			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
 			seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
 		default:
 			c.refs.MergeAdjacent(fr)
@@ -1024,9 +1012,7 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
 	for seg.Ok() && seg.Start() < fr.End {
 		seg = c.refs.Isolate(seg, fr)
 		if old := seg.Value(); old == 1 {
-			if usage.IncrementalMappedAccounting {
-				usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
-			}
+			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
 			seg = c.refs.Remove(seg).NextSegment()
 		} else {
 			seg.SetValue(old - 1)
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index f4326706a..d6ef644d8 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -277,8 +277,3 @@ func TotalMemory(memSize, used uint64) uint64 {
 	}
 	return memSize
 }
-
-// IncrementalMappedAccounting controls whether host mapped memory is accounted
-// incrementally during map translation. This may be modified during early
-// initialization, and is read-only afterward.
-var IncrementalMappedAccounting = false
-- 
cgit v1.2.3


From 67a2ab1438cdccbe045143bbfaa807cf83110ebc Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 3 Sep 2019 22:01:34 -0700
Subject: Impose order on test scripts.

The simple test script has gotten out of control. Shard this script into
different pieces and attempt to impose order on overall test structure. This
change helps lay some of the foundations for future improvements.

 * The runsc/test directories are moved into just test/.
 * The runsc/test/testutil package is split into logical pieces.
 * The scripts/ directory contains new top-level targets.
 * Each test is now responsible for building targets it requires.
 * The install functionality is moved into `runsc` itself for simplicity.
 * The existing kokoro run_tests.sh file now just calls all (can be split).

After this change is merged,  I will create multiple distinct workflows for
Kokoro, one for each of the scripts currently targeted by `run_tests.sh` today,
which should dramatically reduce the time-to-run for the Kokoro tests, and
provides a better foundation for further improvements to the infrastructure.

PiperOrigin-RevId: 267081397
---
 kokoro/build.cfg                              |  23 ++
 kokoro/common.cfg                             |   2 +-
 kokoro/continuous.cfg                         |   8 +-
 kokoro/do_tests.cfg                           |   9 +
 kokoro/docker_tests.cfg                       |   9 +
 kokoro/go.cfg                                 |   6 +
 kokoro/go_test.cfg                            |   1 +
 kokoro/hostnet_tests.cfg                      |   9 +
 kokoro/kvm_tests.cfg                          |   9 +
 kokoro/make_tests.cfg                         |   9 +
 kokoro/overlay_tests.cfg                      |   9 +
 kokoro/presubmit.cfg                          |   6 +-
 kokoro/release-nightly.cfg                    |   5 +-
 kokoro/release.cfg                            |   1 +
 kokoro/root_tests.cfg                         |   9 +
 kokoro/run_build.sh                           |  20 +-
 kokoro/run_tests.sh                           |  30 +-
 kokoro/simple_tests.cfg                       |   9 +
 kokoro/syscall_tests.cfg                      |   9 +
 pkg/sentry/fsimpl/ext/BUILD                   |   2 +-
 pkg/sentry/fsimpl/ext/ext_test.go             |   2 +-
 pkg/sentry/platform/kvm/BUILD                 |   1 +
 runsc/BUILD                                   |   9 -
 runsc/cgroup/BUILD                            |   4 +-
 runsc/cmd/BUILD                               |   3 +-
 runsc/cmd/capability_test.go                  |   4 +-
 runsc/cmd/install.go                          | 210 ++++++++++++
 runsc/container/BUILD                         |   2 +-
 runsc/container/console_test.go               |   2 +-
 runsc/container/container_test.go             |  33 +-
 runsc/container/multi_container_test.go       |   2 +-
 runsc/container/shared_volume_test.go         |   2 +-
 runsc/container/test_app/BUILD                |   2 +-
 runsc/container/test_app/fds.go               |   4 +-
 runsc/container/test_app/test_app.go          |   2 +-
 runsc/criutil/BUILD                           |  12 +
 runsc/criutil/criutil.go                      | 246 ++++++++++++++
 runsc/debian/postinst.sh                      |   6 +-
 runsc/dockerutil/BUILD                        |  15 +
 runsc/dockerutil/dockerutil.go                | 445 ++++++++++++++++++++++++++
 runsc/main.go                                 |   5 +
 runsc/test/BUILD                              |   0
 runsc/test/README.md                          |  24 --
 runsc/test/build_defs.bzl                     |  19 --
 runsc/test/image/BUILD                        |  31 --
 runsc/test/image/image.go                     |  16 -
 runsc/test/image/image_test.go                | 350 --------------------
 runsc/test/image/latin10k.txt                 |  33 --
 runsc/test/image/mysql.sql                    |  23 --
 runsc/test/image/ruby.rb                      |  23 --
 runsc/test/image/ruby.sh                      |  20 --
 runsc/test/install.sh                         |  93 ------
 runsc/test/integration/BUILD                  |  30 --
 runsc/test/integration/exec_test.go           | 161 ----------
 runsc/test/integration/integration.go         |  16 -
 runsc/test/integration/integration_test.go    | 344 --------------------
 runsc/test/integration/regression_test.go     |  45 ---
 runsc/test/root/BUILD                         |  33 --
 runsc/test/root/cgroup_test.go                | 237 --------------
 runsc/test/root/chroot_test.go                | 161 ----------
 runsc/test/root/crictl_test.go                | 242 --------------
 runsc/test/root/root.go                       |  16 -
 runsc/test/root/testdata/BUILD                |  18 --
 runsc/test/root/testdata/busybox.go           |  32 --
 runsc/test/root/testdata/containerd_config.go |  39 ---
 runsc/test/root/testdata/httpd.go             |  32 --
 runsc/test/root/testdata/httpd_mount_paths.go |  53 ---
 runsc/test/root/testdata/sandbox.go           |  30 --
 runsc/test/testutil/BUILD                     |  22 --
 runsc/test/testutil/crictl.go                 | 241 --------------
 runsc/test/testutil/docker.go                 | 410 ------------------------
 runsc/test/testutil/testutil.go               | 421 ------------------------
 runsc/test/testutil/testutil_race.go          |  21 --
 runsc/testutil/BUILD                          |  17 +
 runsc/testutil/testutil.go                    | 440 +++++++++++++++++++++++++
 runsc/tools/dockercfg/BUILD                   |  10 -
 runsc/tools/dockercfg/dockercfg.go            | 193 -----------
 scripts/build.sh                              |  62 ++++
 scripts/common.sh                             |  23 ++
 scripts/common_bazel.sh                       |  77 +++++
 scripts/do_tests.sh                           |  27 ++
 scripts/docker_tests.sh                       |  22 ++
 scripts/go.sh                                 |  34 ++
 scripts/hostnet_tests.sh                      |  22 ++
 scripts/kvm_tests.sh                          |  30 ++
 scripts/make_tests.sh                         |  24 ++
 scripts/overlay_tests.sh                      |  22 ++
 scripts/release.sh                            |  34 ++
 scripts/root_tests.sh                         |  31 ++
 scripts/simple_tests.sh                       |  20 ++
 scripts/syscall_tests.sh                      |  20 ++
 test/README.md                                |  18 ++
 test/e2e/BUILD                                |  31 ++
 test/e2e/exec_test.go                         | 156 +++++++++
 test/e2e/integration.go                       |  16 +
 test/e2e/integration_test.go                  | 348 ++++++++++++++++++++
 test/e2e/regression_test.go                   |  45 +++
 test/image/BUILD                              |  34 ++
 test/image/image.go                           |  16 +
 test/image/image_test.go                      | 353 ++++++++++++++++++++
 test/image/latin10k.txt                       |  33 ++
 test/image/mysql.sql                          |  23 ++
 test/image/ruby.rb                            |  23 ++
 test/image/ruby.sh                            |  20 ++
 test/root/BUILD                               |  36 +++
 test/root/cgroup_test.go                      | 238 ++++++++++++++
 test/root/chroot_test.go                      | 158 +++++++++
 test/root/crictl_test.go                      | 242 ++++++++++++++
 test/root/root.go                             |  16 +
 test/root/testdata/BUILD                      |  18 ++
 test/root/testdata/busybox.go                 |  32 ++
 test/root/testdata/containerd_config.go       |  39 +++
 test/root/testdata/httpd.go                   |  32 ++
 test/root/testdata/httpd_mount_paths.go       |  53 +++
 test/root/testdata/sandbox.go                 |  30 ++
 test/runtimes/BUILD                           |   4 +-
 test/runtimes/build_defs.bzl                  |  19 ++
 test/runtimes/common/BUILD                    |   2 +-
 test/runtimes/common/common_test.go           |   2 +-
 test/runtimes/runtimes_test.go                |   2 +-
 test/syscalls/BUILD                           |   3 +-
 test/syscalls/build_defs.bzl                  |   1 +
 test/syscalls/syscall_test_runner.go          |   2 +-
 tools/make_repository.sh                      |  69 ++++
 tools/run_build.sh                            |  49 ---
 tools/run_tests.sh                            | 304 ------------------
 126 files changed, 4148 insertions(+), 3859 deletions(-)
 create mode 100644 kokoro/build.cfg
 create mode 100644 kokoro/do_tests.cfg
 create mode 100644 kokoro/docker_tests.cfg
 create mode 100644 kokoro/go.cfg
 create mode 100644 kokoro/go_test.cfg
 create mode 100644 kokoro/hostnet_tests.cfg
 create mode 100644 kokoro/kvm_tests.cfg
 create mode 100644 kokoro/make_tests.cfg
 create mode 100644 kokoro/overlay_tests.cfg
 create mode 100644 kokoro/release.cfg
 create mode 100644 kokoro/root_tests.cfg
 mode change 120000 => 100755 kokoro/run_build.sh
 mode change 120000 => 100644 kokoro/run_tests.sh
 create mode 100644 kokoro/simple_tests.cfg
 create mode 100644 kokoro/syscall_tests.cfg
 create mode 100644 runsc/cmd/install.go
 create mode 100644 runsc/criutil/BUILD
 create mode 100644 runsc/criutil/criutil.go
 create mode 100644 runsc/dockerutil/BUILD
 create mode 100644 runsc/dockerutil/dockerutil.go
 delete mode 100644 runsc/test/BUILD
 delete mode 100644 runsc/test/README.md
 delete mode 100644 runsc/test/build_defs.bzl
 delete mode 100644 runsc/test/image/BUILD
 delete mode 100644 runsc/test/image/image.go
 delete mode 100644 runsc/test/image/image_test.go
 delete mode 100644 runsc/test/image/latin10k.txt
 delete mode 100644 runsc/test/image/mysql.sql
 delete mode 100644 runsc/test/image/ruby.rb
 delete mode 100644 runsc/test/image/ruby.sh
 delete mode 100755 runsc/test/install.sh
 delete mode 100644 runsc/test/integration/BUILD
 delete mode 100644 runsc/test/integration/exec_test.go
 delete mode 100644 runsc/test/integration/integration.go
 delete mode 100644 runsc/test/integration/integration_test.go
 delete mode 100644 runsc/test/integration/regression_test.go
 delete mode 100644 runsc/test/root/BUILD
 delete mode 100644 runsc/test/root/cgroup_test.go
 delete mode 100644 runsc/test/root/chroot_test.go
 delete mode 100644 runsc/test/root/crictl_test.go
 delete mode 100644 runsc/test/root/root.go
 delete mode 100644 runsc/test/root/testdata/BUILD
 delete mode 100644 runsc/test/root/testdata/busybox.go
 delete mode 100644 runsc/test/root/testdata/containerd_config.go
 delete mode 100644 runsc/test/root/testdata/httpd.go
 delete mode 100644 runsc/test/root/testdata/httpd_mount_paths.go
 delete mode 100644 runsc/test/root/testdata/sandbox.go
 delete mode 100644 runsc/test/testutil/BUILD
 delete mode 100644 runsc/test/testutil/crictl.go
 delete mode 100644 runsc/test/testutil/docker.go
 delete mode 100644 runsc/test/testutil/testutil.go
 delete mode 100644 runsc/test/testutil/testutil_race.go
 create mode 100644 runsc/testutil/BUILD
 create mode 100644 runsc/testutil/testutil.go
 delete mode 100644 runsc/tools/dockercfg/BUILD
 delete mode 100644 runsc/tools/dockercfg/dockercfg.go
 create mode 100755 scripts/build.sh
 create mode 100755 scripts/common.sh
 create mode 100755 scripts/common_bazel.sh
 create mode 100755 scripts/do_tests.sh
 create mode 100755 scripts/docker_tests.sh
 create mode 100755 scripts/go.sh
 create mode 100755 scripts/hostnet_tests.sh
 create mode 100755 scripts/kvm_tests.sh
 create mode 100755 scripts/make_tests.sh
 create mode 100755 scripts/overlay_tests.sh
 create mode 100755 scripts/release.sh
 create mode 100755 scripts/root_tests.sh
 create mode 100755 scripts/simple_tests.sh
 create mode 100755 scripts/syscall_tests.sh
 create mode 100644 test/README.md
 create mode 100644 test/e2e/BUILD
 create mode 100644 test/e2e/exec_test.go
 create mode 100644 test/e2e/integration.go
 create mode 100644 test/e2e/integration_test.go
 create mode 100644 test/e2e/regression_test.go
 create mode 100644 test/image/BUILD
 create mode 100644 test/image/image.go
 create mode 100644 test/image/image_test.go
 create mode 100644 test/image/latin10k.txt
 create mode 100644 test/image/mysql.sql
 create mode 100644 test/image/ruby.rb
 create mode 100644 test/image/ruby.sh
 create mode 100644 test/root/BUILD
 create mode 100644 test/root/cgroup_test.go
 create mode 100644 test/root/chroot_test.go
 create mode 100644 test/root/crictl_test.go
 create mode 100644 test/root/root.go
 create mode 100644 test/root/testdata/BUILD
 create mode 100644 test/root/testdata/busybox.go
 create mode 100644 test/root/testdata/containerd_config.go
 create mode 100644 test/root/testdata/httpd.go
 create mode 100644 test/root/testdata/httpd_mount_paths.go
 create mode 100644 test/root/testdata/sandbox.go
 create mode 100644 test/runtimes/build_defs.bzl
 create mode 100755 tools/make_repository.sh
 delete mode 100755 tools/run_build.sh
 delete mode 100755 tools/run_tests.sh

(limited to 'pkg/sentry')

diff --git a/kokoro/build.cfg b/kokoro/build.cfg
new file mode 100644
index 000000000..d67af4694
--- /dev/null
+++ b/kokoro/build.cfg
@@ -0,0 +1,23 @@
+build_file: "repo/scripts/build.sh"
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+      keystore_config_id: 73898
+      keyname: "kokoro-repo-key"
+    }
+  }
+}
+
+env_vars {
+  key: "KOKORO_REPO_KEY"
+  value: "$KOKORO_ROOT/src/keystore/73898_kokoro-repo-key"
+}
+
+action {
+  define_artifacts {
+    regex: "**/runsc"
+    regex: "**/runsc.sha256"
+    regex: "**/repo/**"
+  }
+}
diff --git a/kokoro/common.cfg b/kokoro/common.cfg
index cad873fe1..669a2e458 100644
--- a/kokoro/common.cfg
+++ b/kokoro/common.cfg
@@ -10,7 +10,7 @@ before_action {
 
 # Configure bazel to access RBE.
 bazel_setting {
-  # Our GCP project name
+  # Our GCP project name.
   project_id: "gvisor-rbe"
 
   # Use RBE for execution as well as caching.
diff --git a/kokoro/continuous.cfg b/kokoro/continuous.cfg
index 8da47736a..88694220a 100644
--- a/kokoro/continuous.cfg
+++ b/kokoro/continuous.cfg
@@ -1,13 +1,11 @@
-# Location of bash script that runs the test.  The first directory in the path
-# is the directory where Kokoro will check out the repo.  The rest is the path
-# is the path to the test script.
-build_file: "repo/kokoro/run_tests.sh"
+# This is a temporary file. It will be removed when new Kokoro jobs exist for
+# all the other presubmits.
+build_file: "repo/scripts/build.sh"
 
 action {
   define_artifacts {
     regex: "**/sponge_log.xml"
     regex: "**/sponge_log.log"
     regex: "**/outputs.zip"
-    regex: "**/runsc-logs.tar.gz"
   }
 }
diff --git a/kokoro/do_tests.cfg b/kokoro/do_tests.cfg
new file mode 100644
index 000000000..b45ec0b42
--- /dev/null
+++ b/kokoro/do_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/do_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/docker_tests.cfg b/kokoro/docker_tests.cfg
new file mode 100644
index 000000000..717d71dd3
--- /dev/null
+++ b/kokoro/docker_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/docker_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/go.cfg b/kokoro/go.cfg
new file mode 100644
index 000000000..d1577252a
--- /dev/null
+++ b/kokoro/go.cfg
@@ -0,0 +1,6 @@
+build_file: "repo/scripts/go.sh"
+
+env_vars {
+  key: "KOKORO_GO_PUSH"
+  value: "true"
+}
diff --git a/kokoro/go_test.cfg b/kokoro/go_test.cfg
new file mode 100644
index 000000000..5eb51041a
--- /dev/null
+++ b/kokoro/go_test.cfg
@@ -0,0 +1 @@
+build_file: "repo/scripts/go.sh"
diff --git a/kokoro/hostnet_tests.cfg b/kokoro/hostnet_tests.cfg
new file mode 100644
index 000000000..532755f4a
--- /dev/null
+++ b/kokoro/hostnet_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/hostnet_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/kvm_tests.cfg b/kokoro/kvm_tests.cfg
new file mode 100644
index 000000000..54365c2b2
--- /dev/null
+++ b/kokoro/kvm_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/kvm_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/make_tests.cfg b/kokoro/make_tests.cfg
new file mode 100644
index 000000000..d973130ff
--- /dev/null
+++ b/kokoro/make_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/make_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/overlay_tests.cfg b/kokoro/overlay_tests.cfg
new file mode 100644
index 000000000..abd96f60c
--- /dev/null
+++ b/kokoro/overlay_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/overlay_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/presubmit.cfg b/kokoro/presubmit.cfg
index 8da47736a..eb0c78ea4 100644
--- a/kokoro/presubmit.cfg
+++ b/kokoro/presubmit.cfg
@@ -1,6 +1,5 @@
-# Location of bash script that runs the test.  The first directory in the path
-# is the directory where Kokoro will check out the repo.  The rest is the path
-# is the path to the test script.
+# This is a temporary file. It will be removed when new Kokoro jobs exist for
+# all the other presubmits.
 build_file: "repo/kokoro/run_tests.sh"
 
 action {
@@ -8,6 +7,5 @@ action {
     regex: "**/sponge_log.xml"
     regex: "**/sponge_log.log"
     regex: "**/outputs.zip"
-    regex: "**/runsc-logs.tar.gz"
   }
 }
diff --git a/kokoro/release-nightly.cfg b/kokoro/release-nightly.cfg
index e5087b1cd..ae134258c 100644
--- a/kokoro/release-nightly.cfg
+++ b/kokoro/release-nightly.cfg
@@ -1,9 +1,8 @@
-# Location of bash script that builds a release.
+# This file is a temporary bridge. It will be removed shortly, when Kokoro jobs
+# are configured to point at the new build and release configurations.
 build_file: "repo/kokoro/run_build.sh"
 
 action {
-  # Upload runsc binary and its checksum. It may be in multiple paths, so we
-  # must use the wildcard.
   define_artifacts {
     regex: "**/runsc"
     regex: "**/runsc.sha512"
diff --git a/kokoro/release.cfg b/kokoro/release.cfg
new file mode 100644
index 000000000..b9d35bc51
--- /dev/null
+++ b/kokoro/release.cfg
@@ -0,0 +1 @@
+build_file: "repo/scripts/release.sh"
diff --git a/kokoro/root_tests.cfg b/kokoro/root_tests.cfg
new file mode 100644
index 000000000..20b97766a
--- /dev/null
+++ b/kokoro/root_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/root_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh
deleted file mode 120000
index 9deafe9bb..000000000
--- a/kokoro/run_build.sh
+++ /dev/null
@@ -1 +0,0 @@
-../tools/run_build.sh
\ No newline at end of file
diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh
new file mode 100755
index 000000000..da6a0c85e
--- /dev/null
+++ b/kokoro/run_build.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is a temporary bridge. We will create multiple independent Kokoro
+# workflows that call each of the build scripts independently.
+KOKORO_BUILD_NIGHTLY=true $(dirname $0)/../scripts/build.sh
diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh
deleted file mode 120000
index 931cd2622..000000000
--- a/kokoro/run_tests.sh
+++ /dev/null
@@ -1 +0,0 @@
-../tools/run_tests.sh
\ No newline at end of file
diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh
new file mode 100644
index 000000000..5552da11c
--- /dev/null
+++ b/kokoro/run_tests.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# This file is a temporary bridge. We will create multiple independent Kokoro
+# workflows that call each of the test scripts independently.
+
+# Run all the tests in sequence.
+$(dirname $0)/../scripts/do_tests.sh
+$(dirname $0)/../scripts/make_tests.sh
+$(dirname $0)/../scripts/root_tests.sh
+$(dirname $0)/../scripts/docker_tests.sh
+$(dirname $0)/../scripts/overlay_tests.sh
+$(dirname $0)/../scripts/hostnet_tests.sh
+$(dirname $0)/../scripts/simple_tests.sh
diff --git a/kokoro/simple_tests.cfg b/kokoro/simple_tests.cfg
new file mode 100644
index 000000000..32e0a9431
--- /dev/null
+++ b/kokoro/simple_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/simple_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/kokoro/syscall_tests.cfg b/kokoro/syscall_tests.cfg
new file mode 100644
index 000000000..ee6e4a3a4
--- /dev/null
+++ b/kokoro/syscall_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/syscall_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index a41101339..9e8ebb907 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -79,7 +79,7 @@ go_test(
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
-        "//runsc/test/testutil",
+        "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
     ],
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 49b57a2d6..63cf7aeaf 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -33,7 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 const (
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index ad8b95744..fe979dccf 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -54,6 +54,7 @@ go_test(
     ],
     embed = [":kvm"],
     tags = [
+        "manual",
         "nogotsan",
         "requires-kvm",
     ],
diff --git a/runsc/BUILD b/runsc/BUILD
index cc8852d7d..a2a465e1e 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -66,20 +66,11 @@ pkg_tar(
     strip_prefix = "/runsc/linux_amd64_pure_stripped",
 )
 
-pkg_tar(
-    name = "runsc-tools",
-    srcs = ["//runsc/tools/dockercfg"],
-    mode = "0755",
-    package_dir = "/usr/libexec/runsc",
-    strip_prefix = "/runsc/tools/dockercfg/linux_amd64_stripped",
-)
-
 pkg_tar(
     name = "debian-data",
     extension = "tar.gz",
     deps = [
         ":runsc-bin",
-        ":runsc-tools",
     ],
 )
 
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index ab2387614..d6165f9e5 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -6,9 +6,7 @@ go_library(
     name = "cgroup",
     srcs = ["cgroup.go"],
     importpath = "gvisor.dev/gvisor/runsc/cgroup",
-    visibility = [
-        "//runsc:__subpackages__",
-    ],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
         "//runsc/specutils",
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 5223b9972..250845ad7 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -19,6 +19,7 @@ go_library(
         "exec.go",
         "gofer.go",
         "help.go",
+        "install.go",
         "kill.go",
         "list.go",
         "path.go",
@@ -81,7 +82,7 @@ go_test(
         "//runsc/boot",
         "//runsc/container",
         "//runsc/specutils",
-        "//runsc/test/testutil",
+        "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index 3ae25a257..0c27f7313 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -15,6 +15,7 @@
 package cmd
 
 import (
+	"flag"
 	"fmt"
 	"os"
 	"testing"
@@ -25,7 +26,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func init() {
@@ -121,6 +122,7 @@ func TestCapabilities(t *testing.T) {
 }
 
 func TestMain(m *testing.M) {
+	flag.Parse()
 	specutils.MaybeRunAsRoot()
 	os.Exit(m.Run())
 }
diff --git a/runsc/cmd/install.go b/runsc/cmd/install.go
new file mode 100644
index 000000000..441c1db0d
--- /dev/null
+++ b/runsc/cmd/install.go
@@ -0,0 +1,210 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"path"
+
+	"flag"
+	"github.com/google/subcommands"
+)
+
+// Install implements subcommands.Command.
+type Install struct {
+	ConfigFile   string
+	Runtime      string
+	Experimental bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Install) Name() string {
+	return "install"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Install) Synopsis() string {
+	return "adds a runtime to docker daemon configuration"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Install) Usage() string {
+	return `install [flags] <name> [-- [args...]] -- if provided, args are passed to the runtime
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (i *Install) SetFlags(fs *flag.FlagSet) {
+	fs.StringVar(&i.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file")
+	fs.StringVar(&i.Runtime, "runtime", "runsc", "runtime name")
+	fs.BoolVar(&i.Experimental, "experimental", false, "enable experimental features")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (i *Install) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Grab the name and arguments.
+	runtimeArgs := f.Args()
+
+	// Extract the executable.
+	path, err := os.Executable()
+	if err != nil {
+		log.Fatalf("Error reading current exectuable: %v", err)
+	}
+
+	// Load the configuration file.
+	c, err := readConfig(i.ConfigFile)
+	if err != nil {
+		log.Fatalf("Error reading config file %q: %v", i.ConfigFile, err)
+	}
+
+	// Add the given runtime.
+	var rts map[string]interface{}
+	if i, ok := c["runtimes"]; ok {
+		rts = i.(map[string]interface{})
+	} else {
+		rts = make(map[string]interface{})
+		c["runtimes"] = rts
+	}
+	rts[i.Runtime] = struct {
+		Path        string   `json:"path,omitempty"`
+		RuntimeArgs []string `json:"runtimeArgs,omitempty"`
+	}{
+		Path:        path,
+		RuntimeArgs: runtimeArgs,
+	}
+
+	// Set experimental if required.
+	if i.Experimental {
+		c["experimental"] = true
+	}
+
+	// Write out the runtime.
+	if err := writeConfig(c, i.ConfigFile); err != nil {
+		log.Fatalf("Error writing config file %q: %v", i.ConfigFile, err)
+	}
+
+	// Success.
+	log.Printf("Added runtime %q with arguments %v to %q.", i.Runtime, runtimeArgs, i.ConfigFile)
+	return subcommands.ExitSuccess
+}
+
+// Uninstall implements subcommands.Command.
+type Uninstall struct {
+	ConfigFile string
+	Runtime    string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Uninstall) Name() string {
+	return "uninstall"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Uninstall) Synopsis() string {
+	return "removes a runtime from docker daemon configuration"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Uninstall) Usage() string {
+	return `uninstall [flags] <name>
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (u *Uninstall) SetFlags(fs *flag.FlagSet) {
+	fs.StringVar(&u.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file")
+	fs.StringVar(&u.Runtime, "runtime", "runsc", "runtime name")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (u *Uninstall) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	log.Printf("Removing runtime %q from %q.", u.Runtime, u.ConfigFile)
+
+	c, err := readConfig(u.ConfigFile)
+	if err != nil {
+		log.Fatalf("Error reading config file %q: %v", u.ConfigFile, err)
+	}
+
+	var rts map[string]interface{}
+	if i, ok := c["runtimes"]; ok {
+		rts = i.(map[string]interface{})
+	} else {
+		log.Fatalf("runtime %q not found", u.Runtime)
+	}
+	if _, ok := rts[u.Runtime]; !ok {
+		log.Fatalf("runtime %q not found", u.Runtime)
+	}
+	delete(rts, u.Runtime)
+
+	if err := writeConfig(c, u.ConfigFile); err != nil {
+		log.Fatalf("Error writing config file %q: %v", u.ConfigFile, err)
+	}
+	return subcommands.ExitSuccess
+}
+
+func readConfig(path string) (map[string]interface{}, error) {
+	// Read the configuration data.
+	configBytes, err := ioutil.ReadFile(path)
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+
+	// Unmarshal the configuration.
+	c := make(map[string]interface{})
+	if len(configBytes) > 0 {
+		if err := json.Unmarshal(configBytes, &c); err != nil {
+			return nil, err
+		}
+	}
+
+	return c, nil
+}
+
+func writeConfig(c map[string]interface{}, filename string) error {
+	// Marshal the configuration.
+	b, err := json.MarshalIndent(c, "", "    ")
+	if err != nil {
+		return err
+	}
+
+	// Copy the old configuration.
+	old, err := ioutil.ReadFile(filename)
+	if err != nil {
+		if !os.IsNotExist(err) {
+			return fmt.Errorf("error reading config file %q: %v", filename, err)
+		}
+	} else {
+		if err := ioutil.WriteFile(filename+"~", old, 0644); err != nil {
+			return fmt.Errorf("error backing up config file %q: %v", filename, err)
+		}
+	}
+
+	// Make the necessary directories.
+	if err := os.MkdirAll(path.Dir(filename), 0755); err != nil {
+		return fmt.Errorf("error creating config directory for %q: %v", filename, err)
+	}
+
+	// Write the new configuration.
+	if err := ioutil.WriteFile(filename, b, 0644); err != nil {
+		return fmt.Errorf("error writing config file %q: %v", filename, err)
+	}
+
+	return nil
+}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index de8202bb1..bc1fa25e3 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -56,7 +56,7 @@ go_test(
         "//runsc/boot",
         "//runsc/boot/platforms",
         "//runsc/specutils",
-        "//runsc/test/testutil",
+        "//runsc/testutil",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index e9372989f..7d67c3a75 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -30,7 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // socketPath creates a path inside bundleDir and ensures that the returned
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 3d4f304f3..2ac12e5b6 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -16,6 +16,7 @@ package container
 
 import (
 	"bytes"
+	"flag"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -39,7 +40,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // waitForProcessList waits for the given process list to show up in the container.
@@ -155,12 +156,7 @@ func waitForFile(f *os.File) error {
 		return nil
 	}
 
-	timeout := 5 * time.Second
-	if testutil.RaceEnabled {
-		// Race makes slow things even slow, so bump the timeout.
-		timeout = 3 * timeout
-	}
-	return testutil.Poll(op, timeout)
+	return testutil.Poll(op, 30*time.Second)
 }
 
 // readOutputNum reads a file at given filepath and returns the int at the
@@ -254,10 +250,6 @@ func configs(opts ...configOption) []*boot.Config {
 			// TODO(b/112165693): KVM tests are flaky. Disable until fixed.
 			continue
 
-			// TODO(b/68787993): KVM doesn't work with --race.
-			if testutil.RaceEnabled {
-				continue
-			}
 			c.Platform = platforms.KVM
 		case nonExclusiveFS:
 			c.FileAccess = boot.FileAccessShared
@@ -1651,22 +1643,27 @@ func TestGoferExits(t *testing.T) {
 }
 
 func TestRootNotMount(t *testing.T) {
-	if testutil.RaceEnabled {
-		// Requires statically linked binary, since it's mapping the root to a
-		// random dir, libs cannot be located.
-		t.Skip("race makes test_app not statically linked")
-	}
-
 	appSym, err := testutil.FindFile("runsc/container/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
+
 	app, err := filepath.EvalSymlinks(appSym)
 	if err != nil {
 		t.Fatalf("error resolving %q symlink: %v", appSym, err)
 	}
 	log.Infof("App path %q is a symlink to %q", appSym, app)
 
+	static, err := testutil.IsStatic(app)
+	if err != nil {
+		t.Fatalf("error reading application binary: %v", err)
+	}
+	if !static {
+		// This happens during race builds; we cannot map in shared
+		// libraries also, so we need to skip the test.
+		t.Skip()
+	}
+
 	root := filepath.Dir(app)
 	exe := "/" + filepath.Base(app)
 	log.Infof("Executing %q in %q", exe, root)
@@ -2067,10 +2064,10 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
 
 func TestMain(m *testing.M) {
 	log.SetLevel(log.Debug)
+	flag.Parse()
 	if err := testutil.ConfigureExePath(); err != nil {
 		panic(err.Error())
 	}
 	specutils.MaybeRunAsRoot()
-
 	os.Exit(m.Run())
 }
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index ae03d24b4..6e5f23ff2 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -32,7 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index 1f90d2462..dc4194134 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -25,7 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // TestSharedVolume checks that modifications to a volume mount are propagated
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 82dbd54d2..9bf9e6e9d 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -13,7 +13,7 @@ go_binary(
     visibility = ["//runsc/container:__pkg__"],
     deps = [
         "//pkg/unet",
-        "//runsc/test/testutil",
+        "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
     ],
 )
diff --git a/runsc/container/test_app/fds.go b/runsc/container/test_app/fds.go
index c12809cab..a90cc1662 100644
--- a/runsc/container/test_app/fds.go
+++ b/runsc/container/test_app/fds.go
@@ -24,7 +24,7 @@ import (
 	"flag"
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/unet"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 const fileContents = "foobarbaz"
@@ -60,7 +60,7 @@ func (fds *fdSender) Execute(ctx context.Context, f *flag.FlagSet, args ...inter
 		log.Fatalf("socket flag must be set")
 	}
 
-	dir, err := ioutil.TempDir(testutil.TmpDir(), "")
+	dir, err := ioutil.TempDir("", "")
 	if err != nil {
 		log.Fatalf("TempDir failed: %v", err)
 	}
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 6578c7b41..7f735c254 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -29,7 +29,7 @@ import (
 
 	"flag"
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func main() {
diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD
new file mode 100644
index 000000000..558133a0e
--- /dev/null
+++ b/runsc/criutil/BUILD
@@ -0,0 +1,12 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "criutil",
+    testonly = 1,
+    srcs = ["criutil.go"],
+    importpath = "gvisor.dev/gvisor/runsc/criutil",
+    visibility = ["//:sandbox"],
+    deps = ["//runsc/testutil"],
+)
diff --git a/runsc/criutil/criutil.go b/runsc/criutil/criutil.go
new file mode 100644
index 000000000..c8ddf5a9a
--- /dev/null
+++ b/runsc/criutil/criutil.go
@@ -0,0 +1,246 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package criutil contains utility functions for interacting with the
+// Container Runtime Interface (CRI), principally via the crictl command line
+// tool. This requires critools to be installed on the local system.
+package criutil
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"time"
+
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+const endpointPrefix = "unix://"
+
+// Crictl contains information required to run the crictl utility.
+type Crictl struct {
+	executable      string
+	timeout         time.Duration
+	imageEndpoint   string
+	runtimeEndpoint string
+}
+
+// NewCrictl returns a Crictl configured with a timeout and an endpoint over
+// which it will talk to containerd.
+func NewCrictl(timeout time.Duration, endpoint string) *Crictl {
+	// Bazel doesn't pass PATH through, assume the location of crictl
+	// unless specified by environment variable.
+	executable := os.Getenv("CRICTL_PATH")
+	if executable == "" {
+		executable = "/usr/local/bin/crictl"
+	}
+	return &Crictl{
+		executable:      executable,
+		timeout:         timeout,
+		imageEndpoint:   endpointPrefix + endpoint,
+		runtimeEndpoint: endpointPrefix + endpoint,
+	}
+}
+
+// Pull pulls an container image. It corresponds to `crictl pull`.
+func (cc *Crictl) Pull(imageName string) error {
+	_, err := cc.run("pull", imageName)
+	return err
+}
+
+// RunPod creates a sandbox. It corresponds to `crictl runp`.
+func (cc *Crictl) RunPod(sbSpecFile string) (string, error) {
+	podID, err := cc.run("runp", sbSpecFile)
+	if err != nil {
+		return "", fmt.Errorf("runp failed: %v", err)
+	}
+	// Strip the trailing newline from crictl output.
+	return strings.TrimSpace(podID), nil
+}
+
+// Create creates a container within a sandbox. It corresponds to `crictl
+// create`.
+func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) {
+	podID, err := cc.run("create", podID, contSpecFile, sbSpecFile)
+	if err != nil {
+		return "", fmt.Errorf("create failed: %v", err)
+	}
+	// Strip the trailing newline from crictl output.
+	return strings.TrimSpace(podID), nil
+}
+
+// Start starts a container. It corresponds to `crictl start`.
+func (cc *Crictl) Start(contID string) (string, error) {
+	output, err := cc.run("start", contID)
+	if err != nil {
+		return "", fmt.Errorf("start failed: %v", err)
+	}
+	return output, nil
+}
+
+// Stop stops a container. It corresponds to `crictl stop`.
+func (cc *Crictl) Stop(contID string) error {
+	_, err := cc.run("stop", contID)
+	return err
+}
+
+// Exec execs a program inside a container. It corresponds to `crictl exec`.
+func (cc *Crictl) Exec(contID string, args ...string) (string, error) {
+	a := []string{"exec", contID}
+	a = append(a, args...)
+	output, err := cc.run(a...)
+	if err != nil {
+		return "", fmt.Errorf("exec failed: %v", err)
+	}
+	return output, nil
+}
+
+// Rm removes a container. It corresponds to `crictl rm`.
+func (cc *Crictl) Rm(contID string) error {
+	_, err := cc.run("rm", contID)
+	return err
+}
+
+// StopPod stops a pod. It corresponds to `crictl stopp`.
+func (cc *Crictl) StopPod(podID string) error {
+	_, err := cc.run("stopp", podID)
+	return err
+}
+
+// containsConfig is a minimal copy of
+// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto
+// It only contains fields needed for testing.
+type containerConfig struct {
+	Status containerStatus
+}
+
+type containerStatus struct {
+	Network containerNetwork
+}
+
+type containerNetwork struct {
+	IP string
+}
+
+// PodIP returns a pod's IP address.
+func (cc *Crictl) PodIP(podID string) (string, error) {
+	output, err := cc.run("inspectp", podID)
+	if err != nil {
+		return "", err
+	}
+	conf := &containerConfig{}
+	if err := json.Unmarshal([]byte(output), conf); err != nil {
+		return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output)
+	}
+	if conf.Status.Network.IP == "" {
+		return "", fmt.Errorf("no IP found in config: %s", output)
+	}
+	return conf.Status.Network.IP, nil
+}
+
+// RmPod removes a container. It corresponds to `crictl rmp`.
+func (cc *Crictl) RmPod(podID string) error {
+	_, err := cc.run("rmp", podID)
+	return err
+}
+
+// StartPodAndContainer pulls an image, then starts a sandbox and container in
+// that sandbox. It returns the pod ID and container ID.
+func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
+	if err := cc.Pull(image); err != nil {
+		return "", "", fmt.Errorf("failed to pull %s: %v", image, err)
+	}
+
+	// Write the specs to files that can be read by crictl.
+	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+	if err != nil {
+		return "", "", fmt.Errorf("failed to write sandbox spec: %v", err)
+	}
+	contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
+	if err != nil {
+		return "", "", fmt.Errorf("failed to write container spec: %v", err)
+	}
+
+	podID, err := cc.RunPod(sbSpecFile)
+	if err != nil {
+		return "", "", err
+	}
+
+	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
+	if err != nil {
+		return "", "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
+	}
+
+	if _, err := cc.Start(contID); err != nil {
+		return "", "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
+	}
+
+	return podID, contID, nil
+}
+
+// StopPodAndContainer stops a container and pod.
+func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
+	if err := cc.Stop(contID); err != nil {
+		return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err)
+	}
+
+	if err := cc.Rm(contID); err != nil {
+		return fmt.Errorf("failed to remove container %q in pod %q: %v", contID, podID, err)
+	}
+
+	if err := cc.StopPod(podID); err != nil {
+		return fmt.Errorf("failed to stop pod %q: %v", podID, err)
+	}
+
+	if err := cc.RmPod(podID); err != nil {
+		return fmt.Errorf("failed to remove pod %q: %v", podID, err)
+	}
+
+	return nil
+}
+
+// run runs crictl with the given args and returns an error if it takes longer
+// than cc.Timeout to run.
+func (cc *Crictl) run(args ...string) (string, error) {
+	defaultArgs := []string{
+		"--image-endpoint", cc.imageEndpoint,
+		"--runtime-endpoint", cc.runtimeEndpoint,
+	}
+	cmd := exec.Command(cc.executable, append(defaultArgs, args...)...)
+
+	// Run the command with a timeout.
+	done := make(chan string)
+	errCh := make(chan error)
+	go func() {
+		output, err := cmd.CombinedOutput()
+		if err != nil {
+			errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output))
+			return
+		}
+		done <- string(output)
+	}()
+	select {
+	case output := <-done:
+		return output, nil
+	case err := <-errCh:
+		return "", err
+	case <-time.After(cc.timeout):
+		if err := testutil.KillCommand(cmd); err != nil {
+			return "", fmt.Errorf("timed out, then couldn't kill process %+v: %v", cmd, err)
+		}
+		return "", fmt.Errorf("timed out: %+v", cmd)
+	}
+}
diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh
index 03a5ff524..dc7aeee87 100755
--- a/runsc/debian/postinst.sh
+++ b/runsc/debian/postinst.sh
@@ -15,10 +15,10 @@
 # limitations under the License.
 
 if [ "$1" != configure ]; then
-    exit 0
+  exit 0
 fi
 
 if [ -f /etc/docker/daemon.json ]; then
-	/usr/libexec/runsc/dockercfg runtime-add runsc /usr/bin/runsc
-	systemctl restart docker
+  runsc install
+  systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
 fi
diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD
new file mode 100644
index 000000000..0e0423504
--- /dev/null
+++ b/runsc/dockerutil/BUILD
@@ -0,0 +1,15 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "dockerutil",
+    testonly = 1,
+    srcs = ["dockerutil.go"],
+    importpath = "gvisor.dev/gvisor/runsc/dockerutil",
+    visibility = ["//:sandbox"],
+    deps = [
+        "//runsc/testutil",
+        "@com_github_kr_pty//:go_default_library",
+    ],
+)
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
new file mode 100644
index 000000000..41f5fe1e8
--- /dev/null
+++ b/runsc/dockerutil/dockerutil.go
@@ -0,0 +1,445 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package dockerutil is a collection of utility functions, primarily for
+// testing.
+package dockerutil
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"os/exec"
+	"path"
+	"regexp"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/kr/pty"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+var (
+	runtime = flag.String("runtime", "runsc", "specify which runtime to use")
+	config  = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
+)
+
+// EnsureSupportedDockerVersion checks if correct docker is installed.
+func EnsureSupportedDockerVersion() {
+	cmd := exec.Command("docker", "version")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		log.Fatalf("Error running %q: %v", "docker version", err)
+	}
+	re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`)
+	matches := re.FindStringSubmatch(string(out))
+	if len(matches) != 3 {
+		log.Fatalf("Invalid docker output: %s", out)
+	}
+	major, _ := strconv.Atoi(matches[1])
+	minor, _ := strconv.Atoi(matches[2])
+	if major < 17 || (major == 17 && minor < 9) {
+		log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor)
+	}
+}
+
+// RuntimePath returns the binary path for the current runtime.
+func RuntimePath() (string, error) {
+	// Read the configuration data; the file must exist.
+	configBytes, err := ioutil.ReadFile(*config)
+	if err != nil {
+		return "", err
+	}
+
+	// Unmarshal the configuration.
+	c := make(map[string]interface{})
+	if err := json.Unmarshal(configBytes, &c); err != nil {
+		return "", err
+	}
+
+	// Decode the expected configuration.
+	r, ok := c["runtimes"]
+	if !ok {
+		return "", fmt.Errorf("no runtimes declared: %v", c)
+	}
+	rs, ok := r.(map[string]interface{})
+	if !ok {
+		// The runtimes are not a map.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	r, ok = rs[*runtime]
+	if !ok {
+		// The expected runtime is not declared.
+		return "", fmt.Errorf("runtime %q not found: %v", *runtime, c)
+	}
+	rs, ok = r.(map[string]interface{})
+	if !ok {
+		// The runtime is not a map.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	p, ok := rs["path"].(string)
+	if !ok {
+		// The runtime does not declare a path.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	return p, nil
+}
+
+// MountMode describes if the mount should be ro or rw.
+type MountMode int
+
+const (
+	// ReadOnly is what the name says.
+	ReadOnly MountMode = iota
+	// ReadWrite is what the name says.
+	ReadWrite
+)
+
+// String returns the mount mode argument for this MountMode.
+func (m MountMode) String() string {
+	switch m {
+	case ReadOnly:
+		return "ro"
+	case ReadWrite:
+		return "rw"
+	}
+	panic(fmt.Sprintf("invalid mode: %d", m))
+}
+
+// MountArg formats the volume argument to mount in the container.
+func MountArg(source, target string, mode MountMode) string {
+	return fmt.Sprintf("-v=%s:%s:%v", source, target, mode)
+}
+
+// LinkArg formats the link argument.
+func LinkArg(source *Docker, target string) string {
+	return fmt.Sprintf("--link=%s:%s", source.Name, target)
+}
+
+// PrepareFiles creates temp directory to copy files there. The sandbox doesn't
+// have access to files in the test dir.
+func PrepareFiles(names ...string) (string, error) {
+	dir, err := ioutil.TempDir("", "image-test")
+	if err != nil {
+		return "", fmt.Errorf("ioutil.TempDir failed: %v", err)
+	}
+	if err := os.Chmod(dir, 0777); err != nil {
+		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
+	}
+	for _, name := range names {
+		src := getLocalPath(name)
+		dst := path.Join(dir, name)
+		if err := testutil.Copy(src, dst); err != nil {
+			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
+		}
+	}
+	return dir, nil
+}
+
+func getLocalPath(file string) string {
+	return path.Join(".", file)
+}
+
+// do executes docker command.
+func do(args ...string) (string, error) {
+	log.Printf("Running: docker %s\n", args)
+	cmd := exec.Command("docker", args...)
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("error executing docker %s: %v\nout: %s", args, err, out)
+	}
+	return string(out), nil
+}
+
+// doWithPty executes docker command with stdio attached to a pty.
+func doWithPty(args ...string) (*exec.Cmd, *os.File, error) {
+	log.Printf("Running with pty: docker %s\n", args)
+	cmd := exec.Command("docker", args...)
+	ptmx, err := pty.Start(cmd)
+	if err != nil {
+		return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err)
+	}
+	return cmd, ptmx, nil
+}
+
+// Pull pulls a docker image. This is used in tests to isolate the
+// time to pull the image off the network from the time to actually
+// start the container, to avoid timeouts over slow networks.
+func Pull(image string) error {
+	_, err := do("pull", image)
+	return err
+}
+
+// Docker contains the name and the runtime of a docker container.
+type Docker struct {
+	Runtime string
+	Name    string
+}
+
+// MakeDocker sets up the struct for a Docker container.
+// Names of containers will be unique.
+func MakeDocker(namePrefix string) Docker {
+	return Docker{
+		Name:    testutil.RandomName(namePrefix),
+		Runtime: *runtime,
+	}
+}
+
+// logDockerID logs a container id, which is needed to find container runsc logs.
+func (d *Docker) logDockerID() {
+	id, err := d.ID()
+	if err != nil {
+		log.Printf("%v\n", err)
+	}
+	log.Printf("Name: %s ID: %v\n", d.Name, id)
+}
+
+// Create calls 'docker create' with the arguments provided.
+func (d *Docker) Create(args ...string) error {
+	a := []string{"create", "--runtime", d.Runtime, "--name", d.Name}
+	a = append(a, args...)
+	_, err := do(a...)
+	if err == nil {
+		d.logDockerID()
+	}
+	return err
+}
+
+// Start calls 'docker start'.
+func (d *Docker) Start() error {
+	if _, err := do("start", d.Name); err != nil {
+		return fmt.Errorf("error starting container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// Stop calls 'docker stop'.
+func (d *Docker) Stop() error {
+	if _, err := do("stop", d.Name); err != nil {
+		return fmt.Errorf("error stopping container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// Run calls 'docker run' with the arguments provided. The container starts
+// running in the background and the call returns immediately.
+func (d *Docker) Run(args ...string) error {
+	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"}
+	a = append(a, args...)
+	_, err := do(a...)
+	if err == nil {
+		d.logDockerID()
+	}
+	return err
+}
+
+// RunWithPty is like Run but with an attached pty.
+func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
+	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-it"}
+	a = append(a, args...)
+	return doWithPty(a...)
+}
+
+// RunFg calls 'docker run' with the arguments provided in the foreground. It
+// blocks until the container exits and returns the output.
+func (d *Docker) RunFg(args ...string) (string, error) {
+	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name}
+	a = append(a, args...)
+	out, err := do(a...)
+	if err == nil {
+		d.logDockerID()
+	}
+	return string(out), err
+}
+
+// Logs calls 'docker logs'.
+func (d *Docker) Logs() (string, error) {
+	return do("logs", d.Name)
+}
+
+// Exec calls 'docker exec' with the arguments provided.
+func (d *Docker) Exec(args ...string) (string, error) {
+	a := []string{"exec", d.Name}
+	a = append(a, args...)
+	return do(a...)
+}
+
+// ExecWithTerminal calls 'docker exec -it' with the arguments provided and
+// attaches a pty to stdio.
+func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) {
+	a := []string{"exec", "-it", d.Name}
+	a = append(a, args...)
+	return doWithPty(a...)
+}
+
+// Pause calls 'docker pause'.
+func (d *Docker) Pause() error {
+	if _, err := do("pause", d.Name); err != nil {
+		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// Unpause calls 'docker pause'.
+func (d *Docker) Unpause() error {
+	if _, err := do("unpause", d.Name); err != nil {
+		return fmt.Errorf("error unpausing container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// Checkpoint calls 'docker checkpoint'.
+func (d *Docker) Checkpoint(name string) error {
+	if _, err := do("checkpoint", "create", d.Name, name); err != nil {
+		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// Restore calls 'docker start --checkname [name]'.
+func (d *Docker) Restore(name string) error {
+	if _, err := do("start", "--checkpoint", name, d.Name); err != nil {
+		return fmt.Errorf("error starting container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// Remove calls 'docker rm'.
+func (d *Docker) Remove() error {
+	if _, err := do("rm", d.Name); err != nil {
+		return fmt.Errorf("error deleting container %q: %v", d.Name, err)
+	}
+	return nil
+}
+
+// CleanUp kills and deletes the container (best effort).
+func (d *Docker) CleanUp() {
+	d.logDockerID()
+	if _, err := do("kill", d.Name); err != nil {
+		if strings.Contains(err.Error(), "is not running") {
+			// Nothing to kill. Don't log the error in this case.
+		} else {
+			log.Printf("error killing container %q: %v", d.Name, err)
+		}
+	}
+	if err := d.Remove(); err != nil {
+		log.Print(err)
+	}
+}
+
+// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
+// docker to allocate a free port in the host and prevent conflicts.
+func (d *Docker) FindPort(sandboxPort int) (int, error) {
+	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
+	out, err := do("inspect", "-f", format, d.Name)
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving port: %v", err)
+	}
+	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+	if err != nil {
+		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
+	}
+	return port, nil
+}
+
+// SandboxPid returns the PID to the sandbox process.
+func (d *Docker) SandboxPid() (int, error) {
+	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving pid: %v", err)
+	}
+	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+	if err != nil {
+		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
+	}
+	return pid, nil
+}
+
+// ID returns the container ID.
+func (d *Docker) ID() (string, error) {
+	out, err := do("inspect", "-f={{.Id}}", d.Name)
+	if err != nil {
+		return "", fmt.Errorf("error retrieving ID: %v", err)
+	}
+	return strings.TrimSpace(string(out)), nil
+}
+
+// Wait waits for container to exit, up to the given timeout. Returns error if
+// wait fails or timeout is hit. Returns the application return code otherwise.
+// Note that the application may have failed even if err == nil, always check
+// the exit code.
+func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
+	timeoutChan := time.After(timeout)
+	waitChan := make(chan (syscall.WaitStatus))
+	errChan := make(chan (error))
+
+	go func() {
+		out, err := do("wait", d.Name)
+		if err != nil {
+			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
+		}
+		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+		if err != nil {
+			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
+		}
+		waitChan <- syscall.WaitStatus(uint32(exit))
+	}()
+
+	select {
+	case ws := <-waitChan:
+		return ws, nil
+	case err := <-errChan:
+		return syscall.WaitStatus(1), err
+	case <-timeoutChan:
+		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
+	}
+}
+
+// WaitForOutput calls 'docker logs' to retrieve containers output and searches
+// for the given pattern.
+func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
+	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
+	if err != nil {
+		return "", err
+	}
+	if len(matches) == 0 {
+		return "", nil
+	}
+	return matches[0], nil
+}
+
+// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
+// searches for the given pattern. It returns any regexp submatches as well.
+func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
+	re := regexp.MustCompile(pattern)
+	var out string
+	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
+		var err error
+		out, err = d.Logs()
+		if err != nil {
+			return nil, err
+		}
+		if matches := re.FindStringSubmatch(out); matches != nil {
+			// Success!
+			return matches, nil
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out)
+}
diff --git a/runsc/main.go b/runsc/main.go
index 70f06dbb8..0ff68160d 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -88,6 +88,11 @@ func main() {
 	subcommands.Register(help, "")
 	subcommands.Register(subcommands.FlagsCommand(), "")
 
+	// Installation helpers.
+	const helperGroup = "helpers"
+	subcommands.Register(new(cmd.Install), helperGroup)
+	subcommands.Register(new(cmd.Uninstall), helperGroup)
+
 	// Register user-facing runsc commands.
 	subcommands.Register(new(cmd.Checkpoint), "")
 	subcommands.Register(new(cmd.Create), "")
diff --git a/runsc/test/BUILD b/runsc/test/BUILD
deleted file mode 100644
index e69de29bb..000000000
diff --git a/runsc/test/README.md b/runsc/test/README.md
deleted file mode 100644
index f22a8e017..000000000
--- a/runsc/test/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Tests
-
-The tests defined under this path are verifying functionality beyond what unit
-tests can cover, e.g. integration and end to end tests. Due to their nature,
-they may need extra setup in the test machine and extra configuration to run.
-
--   **integration:** defines integration tests that uses `docker run` to test
-    functionality.
--   **image:** basic end to end test for popular images.
--   **root:** tests that require to be run as root.
--   **testutil:** utilities library to support the tests.
-
-The following setup steps are required in order to run these tests:
-
-     `./runsc/test/install.sh [--runtime <name>]`
-
-The tests expect the runtime name to be provided in the `RUNSC_RUNTIME`
-environment variable (default: `runsc-test`). To run the tests execute:
-
-```
-bazel test --test_env=RUNSC_RUNTIME=runsc-test \
-  //runsc/test/image:image_test \
-  //runsc/test/integration:integration_test
-```
diff --git a/runsc/test/build_defs.bzl b/runsc/test/build_defs.bzl
deleted file mode 100644
index ac28cc037..000000000
--- a/runsc/test/build_defs.bzl
+++ /dev/null
@@ -1,19 +0,0 @@
-"""Defines a rule for runsc test targets."""
-
-load("@io_bazel_rules_go//go:def.bzl", _go_test = "go_test")
-
-# runtime_test is a macro that will create targets to run the given test target
-# with different runtime options.
-def runtime_test(**kwargs):
-    """Runs the given test target with different runtime options."""
-    name = kwargs["name"]
-    _go_test(**kwargs)
-    kwargs["name"] = name + "_hostnet"
-    kwargs["args"] = ["--runtime-type=hostnet"]
-    _go_test(**kwargs)
-    kwargs["name"] = name + "_kvm"
-    kwargs["args"] = ["--runtime-type=kvm"]
-    _go_test(**kwargs)
-    kwargs["name"] = name + "_overlay"
-    kwargs["args"] = ["--runtime-type=overlay"]
-    _go_test(**kwargs)
diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD
deleted file mode 100644
index 58758fde5..000000000
--- a/runsc/test/image/BUILD
+++ /dev/null
@@ -1,31 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("//runsc/test:build_defs.bzl", "runtime_test")
-
-package(licenses = ["notice"])
-
-runtime_test(
-    name = "image_test",
-    size = "large",
-    srcs = [
-        "image_test.go",
-    ],
-    data = [
-        "latin10k.txt",
-        "mysql.sql",
-        "ruby.rb",
-        "ruby.sh",
-    ],
-    embed = [":image"],
-    tags = [
-        # Requires docker and runsc to be configured before the test runs.
-        "manual",
-        "local",
-    ],
-    deps = ["//runsc/test/testutil"],
-)
-
-go_library(
-    name = "image",
-    srcs = ["image.go"],
-    importpath = "gvisor.dev/gvisor/runsc/test/image",
-)
diff --git a/runsc/test/image/image.go b/runsc/test/image/image.go
deleted file mode 100644
index 297f1ab92..000000000
--- a/runsc/test/image/image.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package image is empty. See image_test.go for description.
-package image
diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go
deleted file mode 100644
index ddaa2c13b..000000000
--- a/runsc/test/image/image_test.go
+++ /dev/null
@@ -1,350 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package image provides end-to-end image tests for runsc.
-
-// Each test calls docker commands to start up a container, and tests that it is
-// behaving properly, like connecting to a port or looking at the output. The
-// container is killed and deleted at the end.
-//
-// Setup instruction in runsc/test/README.md.
-package image
-
-import (
-	"fmt"
-	"io/ioutil"
-	"log"
-	"net/http"
-	"os"
-	"path/filepath"
-	"strings"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-func TestHelloWorld(t *testing.T) {
-	d := testutil.MakeDocker("hello-test")
-	if err := d.Run("hello-world"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil {
-		t.Fatalf("docker didn't say hello: %v", err)
-	}
-}
-
-func runHTTPRequest(port int) error {
-	url := fmt.Sprintf("http://localhost:%d/not-found", port)
-	resp, err := http.Get(url)
-	if err != nil {
-		return fmt.Errorf("error reaching http server: %v", err)
-	}
-	if want := http.StatusNotFound; resp.StatusCode != want {
-		return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-
-	url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port)
-	resp, err = http.Get(url)
-	if err != nil {
-		return fmt.Errorf("Error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-
-	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		return fmt.Errorf("Error reading http response: %v", err)
-	}
-	defer resp.Body.Close()
-
-	// READALL is the last word in the file. Ensures everything was read.
-	if want := "READALL"; strings.HasSuffix(string(body), want) {
-		return fmt.Errorf("response doesn't contain %q, resp: %q", want, body)
-	}
-	return nil
-}
-
-func testHTTPServer(t *testing.T, port int) {
-	const requests = 10
-	ch := make(chan error, requests)
-	for i := 0; i < requests; i++ {
-		go func() {
-			start := time.Now()
-			err := runHTTPRequest(port)
-			log.Printf("Response time %v: %v", time.Since(start).String(), err)
-			ch <- err
-		}()
-	}
-
-	for i := 0; i < requests; i++ {
-		err := <-ch
-		if err != nil {
-			t.Errorf("testHTTPServer(%d) failed: %v", port, err)
-		}
-	}
-}
-
-func TestHttpd(t *testing.T) {
-	if err := testutil.Pull("httpd"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("http-test")
-
-	dir, err := testutil.PrepareFiles("latin10k.txt")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
-
-	// Start the container.
-	mountArg := testutil.MountArg(dir, "/usr/local/apache2/htdocs", testutil.ReadOnly)
-	if err := d.Run("-p", "80", mountArg, "httpd"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Find where port 80 is mapped to.
-	port, err := d.FindPort(80)
-	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Errorf("WaitForHTTP() timeout: %v", err)
-	}
-
-	testHTTPServer(t, port)
-}
-
-func TestNginx(t *testing.T) {
-	if err := testutil.Pull("nginx"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("net-test")
-
-	dir, err := testutil.PrepareFiles("latin10k.txt")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
-
-	// Start the container.
-	mountArg := testutil.MountArg(dir, "/usr/share/nginx/html", testutil.ReadOnly)
-	if err := d.Run("-p", "80", mountArg, "nginx"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Find where port 80 is mapped to.
-	port, err := d.FindPort(80)
-	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Errorf("WaitForHTTP() timeout: %v", err)
-	}
-
-	testHTTPServer(t, port)
-}
-
-func TestMysql(t *testing.T) {
-	if err := testutil.Pull("mysql"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("mysql-test")
-
-	// Start the container.
-	if err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Wait until it's up and running.
-	if _, err := d.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
-	}
-
-	client := testutil.MakeDocker("mysql-client-test")
-	dir, err := testutil.PrepareFiles("mysql.sql")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
-
-	// Tell mysql client to connect to the server and execute the file in verbose
-	// mode to verify the output.
-	args := []string{
-		testutil.LinkArg(&d, "mysql"),
-		testutil.MountArg(dir, "/sql", testutil.ReadWrite),
-		"mysql",
-		"mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql",
-	}
-	if err := client.Run(args...); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer client.CleanUp()
-
-	// Ensure file executed to the end and shutdown mysql.
-	if _, err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
-	}
-	if _, err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
-	}
-}
-
-func TestPythonHello(t *testing.T) {
-	// TODO(b/136503277): Once we have more complete python runtime tests,
-	// we can drop this one.
-	const img = "gcr.io/gvisor-presubmit/python-hello"
-	if err := testutil.Pull(img); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("python-hello-test")
-	if err := d.Run("-p", "8080", img); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatalf("WaitForHTTP() timeout: %v", err)
-	}
-
-	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
-	resp, err := http.Get(url)
-	if err != nil {
-		t.Errorf("Error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-}
-
-func TestTomcat(t *testing.T) {
-	if err := testutil.Pull("tomcat:8.0"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("tomcat-test")
-	if err := d.Run("-p", "8080", "tomcat:8.0"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatalf("WaitForHTTP() timeout: %v", err)
-	}
-
-	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
-	resp, err := http.Get(url)
-	if err != nil {
-		t.Errorf("Error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-}
-
-func TestRuby(t *testing.T) {
-	if err := testutil.Pull("ruby"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("ruby-test")
-
-	dir, err := testutil.PrepareFiles("ruby.rb", "ruby.sh")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
-	if err := os.Chmod(filepath.Join(dir, "ruby.sh"), 0333); err != nil {
-		t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err)
-	}
-
-	if err := d.Run("-p", "8080", testutil.MountArg(dir, "/src", testutil.ReadOnly), "ruby", "/src/ruby.sh"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
-	}
-
-	// Wait until it's up and running, 'gem install' can take some time.
-	if err := testutil.WaitForHTTP(port, 1*time.Minute); err != nil {
-		t.Fatalf("WaitForHTTP() timeout: %v", err)
-	}
-
-	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
-	resp, err := http.Get(url)
-	if err != nil {
-		t.Errorf("error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		t.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		t.Fatalf("error reading body: %v", err)
-	}
-	if got, want := string(body), "Hello World"; !strings.Contains(got, want) {
-		t.Errorf("invalid body content, got: %q, want: %q", got, want)
-	}
-}
-
-func TestStdio(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("stdio-test")
-
-	wantStdout := "hello stdout"
-	wantStderr := "bonjour stderr"
-	cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr)
-	if err := d.Run("alpine", "/bin/sh", "-c", cmd); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	for _, want := range []string{wantStdout, wantStderr} {
-		if _, err := d.WaitForOutput(want, 5*time.Second); err != nil {
-			t.Fatalf("docker didn't get output %q : %v", want, err)
-		}
-	}
-}
-
-func TestMain(m *testing.M) {
-	testutil.EnsureSupportedDockerVersion()
-	os.Exit(m.Run())
-}
diff --git a/runsc/test/image/latin10k.txt b/runsc/test/image/latin10k.txt
deleted file mode 100644
index 61341e00b..000000000
--- a/runsc/test/image/latin10k.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras ut placerat felis. Maecenas urna est, auctor a efficitur sit amet, egestas et augue. Curabitur dignissim scelerisque nunc vel cursus. Ut vehicula est pretium, consectetur nunc non, pharetra ligula. Curabitur ut ultricies metus. Suspendisse pulvinar, orci sed fermentum vestibulum, eros turpis molestie lectus, nec elementum risus dolor mattis felis. Donec ultrices ipsum sem, at pretium lacus convallis at. Mauris nulla enim, tincidunt non bibendum at, vehicula pulvinar mauris.
-
-Duis in dapibus turpis. Pellentesque maximus magna odio, ac congue libero laoreet quis. Maecenas euismod risus in justo aliquam accumsan. Nunc quis ornare arcu, sit amet sodales elit. Phasellus nec scelerisque nisl, a tincidunt arcu. Proin ornare est nunc, sed suscipit orci interdum et. Suspendisse condimentum venenatis diam in tempor. Aliquam egestas lectus in rutrum tempus. Donec id egestas eros. Donec molestie consequat purus, sed posuere odio venenatis vitae. Nunc placerat augue id vehicula varius. In hac habitasse platea dictumst. Proin at est accumsan, venenatis quam a, fermentum risus. Phasellus posuere pellentesque enim, id suscipit magna consequat ut. Quisque ut tortor ante.
-
-Cras ut vulputate metus, a laoreet lectus. Vivamus ultrices molestie odio in tristique. Morbi faucibus mi eget sollicitudin fringilla. Fusce vitae lacinia ligula. Sed egestas sed diam eu posuere. Maecenas justo nisl, venenatis vel nibh vel, cursus aliquam velit. Praesent lacinia dui id erat venenatis rhoncus. Morbi gravida felis ante, sit amet vehicula orci rhoncus vitae.
-
-Sed finibus sagittis dictum. Proin auctor suscipit sem et mattis. Phasellus libero ligula, pellentesque ut felis porttitor, fermentum sollicitudin orci. Nulla eu nulla nibh. Fusce a eros risus. Proin vel magna risus. Donec nec elit eleifend, scelerisque sapien vitae, pharetra quam. Donec porttitor mauris scelerisque, tempus orci hendrerit, dapibus felis. Nullam libero elit, sollicitudin a aliquam at, ultrices in erat. Mauris eget ligula sodales, porta turpis et, scelerisque odio. Mauris mollis leo vitae purus gravida, in tempor nunc efficitur. Nulla facilisis posuere augue, nec pellentesque lectus eleifend ac. Vestibulum convallis est a feugiat tincidunt. Donec vitae enim volutpat, tincidunt eros eu, malesuada nibh.
-
-Quisque molestie, magna ornare elementum convallis, erat enim sagittis ipsum, eget porttitor sapien arcu id purus. Donec ut cursus diam. Nulla rutrum nulla et mi fermentum, vel tempus tellus posuere. Proin vitae pharetra nulla, nec ornare ex. Nulla consequat, augue a accumsan euismod, turpis leo ornare ligula, a pulvinar enim dolor ut augue. Quisque volutpat, lectus a varius mollis, nisl eros feugiat sem, at egestas lacus justo eu elit. Vestibulum scelerisque mauris est, sagittis interdum nunc accumsan sit amet. Maecenas aliquet ex ut lacus ornare, eu sagittis nibh imperdiet. Duis ultrices nisi velit, sed sodales risus sollicitudin et. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Etiam a accumsan augue, vitae pulvinar nulla. Pellentesque euismod sodales magna, nec luctus eros mattis eget. Sed lacinia suscipit lectus, eget consectetur dui pellentesque sed. Nullam nec mattis tellus.
-
-Aliquam erat volutpat. Praesent lobortis massa porttitor eros tincidunt, nec consequat diam pharetra. Duis efficitur non lorem sed mattis. Suspendisse justo nunc, pulvinar eu porttitor at, facilisis id eros. Suspendisse potenti. Cras molestie aliquet orci ut fermentum. In tempus aliquet eros nec suscipit. Suspendisse in mauris ut lectus ultrices blandit sit amet vitae est. Nam magna massa, porttitor ut semper id, feugiat vel quam. Suspendisse dignissim posuere scelerisque. Donec scelerisque lorem efficitur suscipit suscipit. Nunc luctus ligula et scelerisque lacinia.
-
-Suspendisse potenti. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Sed ultrices, sem in venenatis scelerisque, tellus ipsum porttitor urna, et iaculis lectus odio ac nisi. Integer luctus dui urna, at sollicitudin elit dapibus eu. Praesent nibh ante, porttitor a ante in, ullamcorper pretium felis. Aliquam vel tortor imperdiet, imperdiet lorem et, cursus mi. Proin tempus velit est, ut hendrerit metus gravida sed. Sed nibh sapien, faucibus quis ipsum in, scelerisque lacinia elit. In nec magna eu magna laoreet rhoncus. Donec vitae rutrum mauris. Integer urna felis, consequat at rhoncus vitae, auctor quis elit. Duis a pulvinar sem, nec gravida nisl. Nam non dapibus purus. Praesent vestibulum turpis nec erat porttitor, a scelerisque purus tincidunt.
-
-Nam fringilla leo nisi, nec placerat nisl luctus eget. Aenean malesuada nunc porta sapien sodales convallis. Suspendisse ut massa tempor, ullamcorper mi ut, faucibus turpis. Vivamus at sagittis metus. Donec varius ac mi eget sodales. Nulla feugiat, nulla eu fringilla fringilla, nunc lorem sollicitudin quam, vitae lacinia velit lorem eu orci. Mauris leo urna, pellentesque ac posuere non, pellentesque sit amet quam.
-
-Vestibulum porta diam urna, a aliquet nibh vestibulum et. Proin interdum bibendum nisl sed rhoncus. Sed vel diam hendrerit, faucibus ante et, hendrerit diam. Nunc dolor augue, mattis non dolor vel, luctus sodales neque. Cras malesuada fermentum dolor eu lobortis. Integer dapibus volutpat consequat. Maecenas posuere feugiat nunc. Donec vel mollis elit, volutpat consequat enim. Nulla id nisi finibus orci imperdiet elementum. Phasellus ultrices, elit vitae consequat rutrum, nisl est congue massa, quis condimentum justo nisi vitae turpis. Maecenas aliquet risus sit amet accumsan elementum. Proin non finibus elit, sit amet lobortis augue.
-
-Morbi pretium pulvinar sem vel sollicitudin. Proin imperdiet fringilla leo, non pellentesque lacus gravida nec. Vivamus ullamcorper consectetur ligula eu consectetur. Curabitur sit amet tempus purus. Curabitur quam quam, tincidunt eu tempus vel, volutpat at ipsum. Maecenas lobortis elit ac justo interdum, sit amet mattis ligula mollis. Sed posuere ligula et felis convallis tempor. Aliquam nec mollis velit. Donec varius sit amet erat at imperdiet. Nulla ipsum justo, tempor non sollicitudin gravida, dignissim vel orci. In hac habitasse platea dictumst. Cras cursus tellus id arcu aliquet accumsan. Phasellus ac erat dui.
-
-Duis mollis metus at mi luctus aliquam. Duis varius eget erat ac porttitor. Phasellus lobortis sagittis lacinia. Etiam sagittis eget erat in pulvinar. Phasellus sodales risus nec vulputate accumsan. Cras sit amet pellentesque dui. Praesent consequat felis mi, at vulputate diam convallis a. Donec hendrerit nibh vel justo consequat dictum. In euismod, dui sit amet malesuada suscipit, mauris ex rhoncus eros, sed ornare arcu nunc eu urna. Pellentesque eget erat augue. Integer rutrum mauris sem, nec sodales nulla cursus vel. Vivamus porta, urna vel varius vulputate, nulla arcu malesuada dui, a ultrices magna ante sed nibh.
-
-Morbi ultricies aliquam lorem id bibendum. Donec sit amet nunc vitae massa gravida eleifend hendrerit vel libero. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nulla vestibulum tempus condimentum. Aliquam dolor ipsum, condimentum in sapien et, tempor iaculis nulla. Aenean non pharetra augue. Maecenas mattis dignissim maximus. Fusce elementum tincidunt massa sit amet lobortis. Phasellus nec pharetra dui, et malesuada ante. Nullam commodo pretium tellus. Praesent sollicitudin, enim eget imperdiet scelerisque, odio felis vulputate dolor, eget auctor neque tellus ac lorem.
-
-In consectetur augue et sapien feugiat varius. Nam tortor mi, consectetur ac felis non, elementum venenatis augue. Suspendisse ut tellus in est sagittis cursus. Quisque faucibus, neque sit amet semper congue, nibh augue finibus odio, vitae interdum dolor arcu eget arcu. Curabitur dictum risus massa, non tincidunt urna molestie non. Maecenas eu quam purus. Donec vulputate, dui eu accumsan blandit, mauris tortor tristique mi, sed blandit leo quam id quam. Ut venenatis sagittis malesuada. Integer non auctor orci. Duis consectetur massa felis. Fusce euismod est sit amet bibendum finibus. Vestibulum dolor ex, tempor at elit in, iaculis cursus dui. Nunc sed neque ac risus rutrum tempus sit amet at ante. In hac habitasse platea dictumst.
-
-Donec rutrum, velit nec viverra tincidunt, est velit viverra neque, quis auctor leo ex at lectus. Morbi eget purus nisi. Aliquam lacus dui, interdum vitae elit at, venenatis dignissim est. Duis ac mollis lorem. Vivamus a vestibulum quam. Maecenas non metus dolor. Praesent tortor nunc, tristique at nisl molestie, vulputate eleifend diam. Integer ultrices lacus odio, vel imperdiet enim accumsan id. Sed ligula tortor, interdum eu velit eget, pharetra pulvinar magna. Sed non lacus in eros tincidunt sagittis ac vel justo. Donec vitae leo sagittis, accumsan ante sit amet, accumsan odio. Ut volutpat ultricies tortor. Vestibulum tempus purus et est tristique sagittis quis vitae turpis.
-
-Nam iaculis neque lacus, eget euismod turpis blandit eget. In hac habitasse platea dictumst. Phasellus justo neque, scelerisque sit amet risus ut, pretium commodo nisl. Phasellus auctor sapien sed ex bibendum fermentum. Proin maximus odio a ante ornare, a feugiat lorem egestas. Etiam efficitur tortor a ante tincidunt interdum. Nullam non est ac massa congue efficitur sit amet nec eros. Nullam at ipsum vel mauris tincidunt efficitur. Duis pulvinar nisl elit, id auctor risus laoreet ac. Sed nunc mauris, tristique id leo ut, condimentum congue nunc. Sed ultricies, mauris et convallis faucibus, justo ex faucibus est, at lobortis purus justo non arcu. Integer vel facilisis elit, dapibus imperdiet mauris.
-
-Pellentesque non mattis turpis, eget bibendum velit. Fusce sollicitudin ante ac tincidunt rhoncus. Praesent porta scelerisque consequat. Donec eleifend faucibus sollicitudin. Quisque vitae purus eget tortor tempor ultrices. Maecenas mauris diam, semper vitae est non, imperdiet tempor magna. Duis elit lacus, auctor vestibulum enim eget, rhoncus porttitor tortor.
-
-Donec non rhoncus nibh. Cras dapibus justo vitae nunc accumsan, id congue erat egestas. Aenean at ante ante. Duis eleifend imperdiet dREADALL
diff --git a/runsc/test/image/mysql.sql b/runsc/test/image/mysql.sql
deleted file mode 100644
index 51554b98d..000000000
--- a/runsc/test/image/mysql.sql
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-SHOW databases;
-USE mysql;
-
-CREATE TABLE foo (id int);
-INSERT INTO foo VALUES(1);
-SELECT * FROM foo;
-DROP TABLE foo;
-
-shutdown;
diff --git a/runsc/test/image/ruby.rb b/runsc/test/image/ruby.rb
deleted file mode 100644
index aced49c6d..000000000
--- a/runsc/test/image/ruby.rb
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-require 'sinatra'
-
-set :bind, "0.0.0.0"
-set :port, 8080
-
-get '/' do
-  'Hello World'
-end
-
diff --git a/runsc/test/image/ruby.sh b/runsc/test/image/ruby.sh
deleted file mode 100644
index ebe8d5b0e..000000000
--- a/runsc/test/image/ruby.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-
-gem install sinatra
-ruby /src/ruby.rb
diff --git a/runsc/test/install.sh b/runsc/test/install.sh
deleted file mode 100755
index 8f05dea20..000000000
--- a/runsc/test/install.sh
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error
-set -e
-
-# Defaults
-declare runtime=runsc-test
-declare uninstall=0
-
-function findExe() {
-  local exe=${1}
-
-  local path=$(find bazel-bin/runsc -type f -executable -name "${exe}" | head -n1)
-  if [[ "${path}" == "" ]]; then
-    echo "Location of ${exe} not found in bazel-bin" >&2
-    exit 1
-  fi
-  echo "${path}"
-}
-
-while [[ $# -gt 0 ]]; do
-  case "$1" in
-    --runtime)
-      shift
-      [ "$#" -le 0 ] && echo "No runtime provided" && exit 1
-      runtime=$1
-      ;;
-    -u)
-      uninstall=1
-      ;;
-    *)
-      echo "Unknown option: ${1}"
-      echo ""
-      echo "Usage: ${0} [--runtime <name>] [-u]"
-      echo "  --runtime    sets the runtime name, default: runsc-test"
-      echo "  -u           uninstall the runtime"
-      exit 1
-  esac
-  shift
-done
-
-# Find location of executables.
-declare -r dockercfg=$(findExe dockercfg)
-[[ "${dockercfg}" == "" ]] && exit 1
-
-declare runsc=$(findExe runsc)
-[[ "${runsc}" == "" ]] && exit 1
-
-if [[ ${uninstall} == 0 ]]; then
-  rm -rf /tmp/${runtime}
-  mkdir -p /tmp/${runtime}
-  cp "${runsc}" /tmp/${runtime}/runsc
-  runsc=/tmp/${runtime}/runsc
-
-  # Make tmp dir and runsc binary readable and executable to all users, since it
-  # will run in an empty user namespace.
-  chmod a+rx "${runsc}" $(dirname "${runsc}")
-
-  # Make log dir executable and writable to all users for the same reason.
-  declare logdir=/tmp/"${runtime?}/logs"
-  mkdir -p "${logdir}"
-  sudo -n chmod a+wx "${logdir}"
-
-  declare -r args="--debug-log '${logdir}/' --debug --strace --log-packets"
-  # experimental is needed to checkpoint/restore.
-  sudo -n "${dockercfg}" --experimental=true runtime-add "${runtime}" "${runsc}" ${args}
-  sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args}
-  sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args}
-  sudo -n "${dockercfg}" runtime-add "${runtime}"-overlay "${runsc}" --overlay ${args}
-
-else
-  sudo -n "${dockercfg}" runtime-rm "${runtime}"
-  sudo -n "${dockercfg}" runtime-rm "${runtime}"-kvm
-  sudo -n "${dockercfg}" runtime-rm "${runtime}"-hostnet
-  sudo -n "${dockercfg}" runtime-rm "${runtime}"-overlay
-fi
-
-echo "Restarting docker service..."
-sudo -n /etc/init.d/docker restart
diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD
deleted file mode 100644
index 12065617c..000000000
--- a/runsc/test/integration/BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("//runsc/test:build_defs.bzl", "runtime_test")
-
-package(licenses = ["notice"])
-
-runtime_test(
-    name = "integration_test",
-    size = "large",
-    srcs = [
-        "exec_test.go",
-        "integration_test.go",
-        "regression_test.go",
-    ],
-    embed = [":integration"],
-    tags = [
-        # Requires docker and runsc to be configured before the test runs.
-        "manual",
-        "local",
-    ],
-    deps = [
-        "//pkg/abi/linux",
-        "//runsc/test/testutil",
-    ],
-)
-
-go_library(
-    name = "integration",
-    srcs = ["integration.go"],
-    importpath = "gvisor.dev/gvisor/runsc/test/integration",
-)
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go
deleted file mode 100644
index 993136f96..000000000
--- a/runsc/test/integration/exec_test.go
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package image provides end-to-end integration tests for runsc. These tests require
-// docker and runsc to be installed on the machine. To set it up, run:
-//
-//     ./runsc/test/install.sh [--runtime <name>]
-//
-// The tests expect the runtime name to be provided in the RUNSC_RUNTIME
-// environment variable (default: runsc-test).
-//
-// Each test calls docker commands to start up a container, and tests that it is
-// behaving properly, with various runsc commands. The container is killed and deleted
-// at the end.
-
-package integration
-
-import (
-	"fmt"
-	"strconv"
-	"strings"
-	"syscall"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-func TestExecCapabilities(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("exec-test")
-
-	// Start the container.
-	if err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
-	if err != nil {
-		t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
-	}
-	if len(matches) != 2 {
-		t.Fatalf("There should be a match for the whole line and the capability bitmask")
-	}
-	capString := matches[1]
-	t.Log("Root capabilities:", capString)
-
-	// CAP_NET_RAW was in the capability set for the container, but was
-	// removed. However, `exec` does not remove it. Verify that it's not
-	// set in the container, then re-add it for comparison.
-	caps, err := strconv.ParseUint(capString, 16, 64)
-	if err != nil {
-		t.Fatalf("failed to convert capabilities %q: %v", capString, err)
-	}
-	if caps&(1<<uint64(linux.CAP_NET_RAW)) != 0 {
-		t.Fatalf("CAP_NET_RAW should be filtered, but is set in the container: %x", caps)
-	}
-	caps |= 1 << uint64(linux.CAP_NET_RAW)
-	want := fmt.Sprintf("CapEff:\t%016x\n", caps)
-
-	// Now check that exec'd process capabilities match the root.
-	got, err := d.Exec("grep", "CapEff:", "/proc/self/status")
-	if err != nil {
-		t.Fatalf("docker exec failed: %v", err)
-	}
-	if got != want {
-		t.Errorf("wrong capabilities, got: %q, want: %q", got, want)
-	}
-}
-
-func TestExecJobControl(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("exec-job-control-test")
-
-	// Start the container.
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Exec 'sh' with an attached pty.
-	cmd, ptmx, err := d.ExecWithTerminal("sh")
-	if err != nil {
-		t.Fatalf("docker exec failed: %v", err)
-	}
-	defer ptmx.Close()
-
-	// Call "sleep 100 | cat" in the shell.  We pipe to cat so that there
-	// will be two processes in the foreground process group.
-	if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Give shell a few seconds to start executing the sleep.
-	time.Sleep(2 * time.Second)
-
-	// Send a ^C to the pty, which should kill sleep and cat, but not the
-	// shell.  \x03 is ASCII "end of text", which is the same as ^C.
-	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// The shell should still be alive at this point. Sleep should have
-	// exited with code 2+128=130. We'll exit with 10 plus that number, so
-	// that we can be sure that the shell did not get signalled.
-	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Exec process should exit with code 10+130=140.
-	ps, err := cmd.Process.Wait()
-	if err != nil {
-		t.Fatalf("error waiting for exec process: %v", err)
-	}
-	ws := ps.Sys().(syscall.WaitStatus)
-	if !ws.Exited() {
-		t.Errorf("ws.Exited got false, want true")
-	}
-	if got, want := ws.ExitStatus(), 140; got != want {
-		t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
-	}
-}
-
-// Test that failure to exec returns proper error message.
-func TestExecError(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("exec-error-test")
-
-	// Start the container.
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	_, err := d.Exec("no_can_find")
-	if err == nil {
-		t.Fatalf("docker exec didn't fail")
-	}
-	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
-		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
-	}
-}
diff --git a/runsc/test/integration/integration.go b/runsc/test/integration/integration.go
deleted file mode 100644
index 4cd5f6c24..000000000
--- a/runsc/test/integration/integration.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package integration is empty. See integration_test.go for description.
-package integration
diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go
deleted file mode 100644
index 7cef4b9dd..000000000
--- a/runsc/test/integration/integration_test.go
+++ /dev/null
@@ -1,344 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package integration provides end-to-end integration tests for runsc.
-//
-// Each test calls docker commands to start up a container, and tests that it is
-// behaving properly, with various runsc commands. The container is killed and
-// deleted at the end.
-//
-// Setup instruction in runsc/test/README.md.
-package integration
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"os"
-	"strconv"
-	"strings"
-	"syscall"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-// httpRequestSucceeds sends a request to a given url and checks that the status is OK.
-func httpRequestSucceeds(client http.Client, server string, port int) error {
-	url := fmt.Sprintf("http://%s:%d", server, port)
-	// Ensure that content is being served.
-	resp, err := client.Get(url)
-	if err != nil {
-		return fmt.Errorf("error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		return fmt.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-	return nil
-}
-
-// TestLifeCycle tests a basic Create/Start/Stop docker container life cycle.
-func TestLifeCycle(t *testing.T) {
-	if err := testutil.Pull("nginx"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("lifecycle-test")
-	if err := d.Create("-p", "80", "nginx"); err != nil {
-		t.Fatal("docker create failed:", err)
-	}
-	if err := d.Start(); err != nil {
-		d.CleanUp()
-		t.Fatal("docker start failed:", err)
-	}
-
-	// Test that container is working
-	port, err := d.FindPort(80)
-	if err != nil {
-		t.Fatal("docker.FindPort(80) failed: ", err)
-	}
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
-	}
-	client := http.Client{Timeout: time.Duration(2 * time.Second)}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
-		t.Error("http request failed:", err)
-	}
-
-	if err := d.Stop(); err != nil {
-		d.CleanUp()
-		t.Fatal("docker stop failed:", err)
-	}
-	if err := d.Remove(); err != nil {
-		t.Fatal("docker rm failed:", err)
-	}
-}
-
-func TestPauseResume(t *testing.T) {
-	const img = "gcr.io/gvisor-presubmit/python-hello"
-	if !testutil.IsPauseResumeSupported() {
-		t.Log("Pause/resume is not supported, skipping test.")
-		return
-	}
-
-	if err := testutil.Pull(img); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("pause-resume-test")
-	if err := d.Run("-p", "8080", img); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatal("docker.FindPort(8080) failed:", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
-	}
-
-	// Check that container is working.
-	client := http.Client{Timeout: time.Duration(2 * time.Second)}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
-		t.Error("http request failed:", err)
-	}
-
-	if err := d.Pause(); err != nil {
-		t.Fatal("docker pause failed:", err)
-	}
-
-	// Check if container is paused.
-	switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) {
-	case nil:
-		t.Errorf("http req expected to fail but it succeeded")
-	case net.Error:
-		if !v.Timeout() {
-			t.Errorf("http req got error %v, wanted timeout", v)
-		}
-	default:
-		t.Errorf("http req got unexpected error %v", v)
-	}
-
-	if err := d.Unpause(); err != nil {
-		t.Fatal("docker unpause failed:", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
-	}
-
-	// Check if container is working again.
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
-		t.Error("http request failed:", err)
-	}
-}
-
-func TestCheckpointRestore(t *testing.T) {
-	const img = "gcr.io/gvisor-presubmit/python-hello"
-	if !testutil.IsPauseResumeSupported() {
-		t.Log("Pause/resume is not supported, skipping test.")
-		return
-	}
-	if err := testutil.Pull(img); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("save-restore-test")
-	if err := d.Run("-p", "8080", img); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	if err := d.Checkpoint("test"); err != nil {
-		t.Fatal("docker checkpoint failed:", err)
-	}
-
-	if _, err := d.Wait(30 * time.Second); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := d.Restore("test"); err != nil {
-		t.Fatal("docker restore failed:", err)
-	}
-
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatal("docker.FindPort(8080) failed:", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
-	}
-
-	// Check if container is working again.
-	client := http.Client{Timeout: time.Duration(2 * time.Second)}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
-		t.Error("http request failed:", err)
-	}
-}
-
-// Create client and server that talk to each other using the local IP.
-func TestConnectToSelf(t *testing.T) {
-	d := testutil.MakeDocker("connect-to-self-test")
-
-	// Creates server that replies "server" and exists. Sleeps at the end because
-	// 'docker exec' gets killed if the init process exists before it can finish.
-	if err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
-		t.Fatal("docker run failed:", err)
-	}
-	defer d.CleanUp()
-
-	// Finds IP address for host.
-	ip, err := d.Exec("/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
-	if err != nil {
-		t.Fatal("docker exec failed:", err)
-	}
-	ip = strings.TrimRight(ip, "\n")
-
-	// Runs client that sends "client" to the server and exits.
-	reply, err := d.Exec("/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
-	if err != nil {
-		t.Fatal("docker exec failed:", err)
-	}
-
-	// Ensure both client and server got the message from each other.
-	if want := "server\n"; reply != want {
-		t.Errorf("Error on server, want: %q, got: %q", want, reply)
-	}
-	if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil {
-		t.Fatal("docker.WaitForOutput(client) timeout:", err)
-	}
-}
-
-func TestMemLimit(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("cgroup-test")
-	cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'"
-	out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd)
-	if err != nil {
-		t.Fatal("docker run failed:", err)
-	}
-	defer d.CleanUp()
-
-	// Remove warning message that swap isn't present.
-	if strings.HasPrefix(out, "WARNING") {
-		lines := strings.Split(out, "\n")
-		if len(lines) != 3 {
-			t.Fatalf("invalid output: %s", out)
-		}
-		out = lines[1]
-	}
-
-	got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64)
-	if err != nil {
-		t.Fatalf("failed to parse %q: %v", out, err)
-	}
-	if want := uint64(500 * 1024); got != want {
-		t.Errorf("MemTotal got: %d, want: %d", got, want)
-	}
-}
-
-func TestNumCPU(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("cgroup-test")
-	cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l"
-	out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd)
-	if err != nil {
-		t.Fatal("docker run failed:", err)
-	}
-	defer d.CleanUp()
-
-	got, err := strconv.Atoi(strings.TrimSpace(out))
-	if err != nil {
-		t.Fatalf("failed to parse %q: %v", out, err)
-	}
-	if want := 1; got != want {
-		t.Errorf("MemTotal got: %d, want: %d", got, want)
-	}
-}
-
-// TestJobControl tests that job control characters are handled properly.
-func TestJobControl(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := testutil.MakeDocker("job-control-test")
-
-	// Start the container with an attached PTY.
-	_, ptmx, err := d.RunWithPty("alpine", "sh")
-	if err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer ptmx.Close()
-	defer d.CleanUp()
-
-	// Call "sleep 100" in the shell.
-	if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Give shell a few seconds to start executing the sleep.
-	time.Sleep(2 * time.Second)
-
-	// Send a ^C to the pty, which should kill sleep, but not the shell.
-	// \x03 is ASCII "end of text", which is the same as ^C.
-	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// The shell should still be alive at this point. Sleep should have
-	// exited with code 2+128=130. We'll exit with 10 plus that number, so
-	// that we can be sure that the shell did not get signalled.
-	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Wait for the container to exit.
-	got, err := d.Wait(5 * time.Second)
-	if err != nil {
-		t.Fatalf("error getting exit code: %v", err)
-	}
-	// Container should exit with code 10+130=140.
-	if want := syscall.WaitStatus(140); got != want {
-		t.Errorf("container exited with code %d want %d", got, want)
-	}
-}
-
-// TestTmpFile checks that files inside '/tmp' are not overridden. In addition,
-// it checks that working dir is created if it doesn't exit.
-func TestTmpFile(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("tmp-file-test")
-	if err := d.Run("-w=/tmp/foo/bar", "--read-only", "alpine", "touch", "/tmp/foo/bar/file"); err != nil {
-		t.Fatal("docker run failed:", err)
-	}
-	defer d.CleanUp()
-}
-
-func TestMain(m *testing.M) {
-	testutil.EnsureSupportedDockerVersion()
-	os.Exit(m.Run())
-}
diff --git a/runsc/test/integration/regression_test.go b/runsc/test/integration/regression_test.go
deleted file mode 100644
index fb68dda99..000000000
--- a/runsc/test/integration/regression_test.go
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package integration
-
-import (
-	"strings"
-	"testing"
-
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-// Test that UDS can be created using overlay when parent directory is in lower
-// layer only (b/134090485).
-//
-// Prerequisite: the directory where the socket file is created must not have
-// been open for write before bind(2) is called.
-func TestBindOverlay(t *testing.T) {
-	if err := testutil.Pull("ubuntu:trusty"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("bind-overlay-test")
-
-	cmd := "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p"
-	got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
-	if err != nil {
-		t.Fatal("docker run failed:", err)
-	}
-
-	if want := "foobar-asdf"; !strings.Contains(got, want) {
-		t.Fatalf("docker run output is missing %q: %s", want, got)
-	}
-	defer d.CleanUp()
-}
diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD
deleted file mode 100644
index 500ef7b8e..000000000
--- a/runsc/test/root/BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "root",
-    srcs = ["root.go"],
-    importpath = "gvisor.dev/gvisor/runsc/test/root",
-)
-
-go_test(
-    name = "root_test",
-    size = "small",
-    srcs = [
-        "cgroup_test.go",
-        "chroot_test.go",
-        "crictl_test.go",
-    ],
-    embed = [":root"],
-    tags = [
-        # Requires docker and runsc to be configured before the test runs.
-        # Also test only runs as root.
-        "manual",
-        "local",
-    ],
-    deps = [
-        "//runsc/cgroup",
-        "//runsc/specutils",
-        "//runsc/test/root/testdata",
-        "//runsc/test/testutil",
-        "@com_github_syndtr_gocapability//capability:go_default_library",
-    ],
-)
diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go
deleted file mode 100644
index 5392dc6e0..000000000
--- a/runsc/test/root/cgroup_test.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package root
-
-import (
-	"bufio"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"testing"
-
-	"gvisor.dev/gvisor/runsc/cgroup"
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-func verifyPid(pid int, path string) error {
-	f, err := os.Open(path)
-	if err != nil {
-		return err
-	}
-	defer f.Close()
-
-	var gots []int
-	scanner := bufio.NewScanner(f)
-	for scanner.Scan() {
-		got, err := strconv.Atoi(scanner.Text())
-		if err != nil {
-			return err
-		}
-		if got == pid {
-			return nil
-		}
-		gots = append(gots, got)
-	}
-	if scanner.Err() != nil {
-		return scanner.Err()
-	}
-	return fmt.Errorf("got: %s, want: %d", gots, pid)
-}
-
-// TestCgroup sets cgroup options and checks that cgroup was properly configured.
-func TestCgroup(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("cgroup-test")
-
-	attrs := []struct {
-		arg            string
-		ctrl           string
-		file           string
-		want           string
-		skipIfNotFound bool
-	}{
-		{
-			arg:  "--cpu-shares=1000",
-			ctrl: "cpu",
-			file: "cpu.shares",
-			want: "1000",
-		},
-		{
-			arg:  "--cpu-period=2000",
-			ctrl: "cpu",
-			file: "cpu.cfs_period_us",
-			want: "2000",
-		},
-		{
-			arg:  "--cpu-quota=3000",
-			ctrl: "cpu",
-			file: "cpu.cfs_quota_us",
-			want: "3000",
-		},
-		{
-			arg:  "--cpuset-cpus=0",
-			ctrl: "cpuset",
-			file: "cpuset.cpus",
-			want: "0",
-		},
-		{
-			arg:  "--cpuset-mems=0",
-			ctrl: "cpuset",
-			file: "cpuset.mems",
-			want: "0",
-		},
-		{
-			arg:  "--kernel-memory=100MB",
-			ctrl: "memory",
-			file: "memory.kmem.limit_in_bytes",
-			want: "104857600",
-		},
-		{
-			arg:  "--memory=1GB",
-			ctrl: "memory",
-			file: "memory.limit_in_bytes",
-			want: "1073741824",
-		},
-		{
-			arg:  "--memory-reservation=500MB",
-			ctrl: "memory",
-			file: "memory.soft_limit_in_bytes",
-			want: "524288000",
-		},
-		{
-			arg:            "--memory-swap=2GB",
-			ctrl:           "memory",
-			file:           "memory.memsw.limit_in_bytes",
-			want:           "2147483648",
-			skipIfNotFound: true, // swap may be disabled on the machine.
-		},
-		{
-			arg:  "--memory-swappiness=5",
-			ctrl: "memory",
-			file: "memory.swappiness",
-			want: "5",
-		},
-		{
-			arg:  "--blkio-weight=750",
-			ctrl: "blkio",
-			file: "blkio.weight",
-			want: "750",
-		},
-	}
-
-	args := make([]string, 0, len(attrs))
-	for _, attr := range attrs {
-		args = append(args, attr.arg)
-	}
-
-	args = append(args, "alpine", "sleep", "10000")
-	if err := d.Run(args...); err != nil {
-		t.Fatal("docker create failed:", err)
-	}
-	defer d.CleanUp()
-
-	gid, err := d.ID()
-	if err != nil {
-		t.Fatalf("Docker.ID() failed: %v", err)
-	}
-	t.Logf("cgroup ID: %s", gid)
-
-	// Check list of attributes defined above.
-	for _, attr := range attrs {
-		path := filepath.Join("/sys/fs/cgroup", attr.ctrl, "docker", gid, attr.file)
-		out, err := ioutil.ReadFile(path)
-		if err != nil {
-			if os.IsNotExist(err) && attr.skipIfNotFound {
-				t.Logf("skipped %s/%s", attr.ctrl, attr.file)
-				continue
-			}
-			t.Fatalf("failed to read %q: %v", path, err)
-		}
-		if got := strings.TrimSpace(string(out)); got != attr.want {
-			t.Errorf("arg: %q, cgroup attribute %s/%s, got: %q, want: %q", attr.arg, attr.ctrl, attr.file, got, attr.want)
-		}
-	}
-
-	// Check that sandbox is inside cgroup.
-	controllers := []string{
-		"blkio",
-		"cpu",
-		"cpuset",
-		"memory",
-		"net_cls",
-		"net_prio",
-		"devices",
-		"freezer",
-		"perf_event",
-		"pids",
-		"systemd",
-	}
-	pid, err := d.SandboxPid()
-	if err != nil {
-		t.Fatalf("SandboxPid: %v", err)
-	}
-	for _, ctrl := range controllers {
-		path := filepath.Join("/sys/fs/cgroup", ctrl, "docker", gid, "cgroup.procs")
-		if err := verifyPid(pid, path); err != nil {
-			t.Errorf("cgroup control %q processes: %v", ctrl, err)
-		}
-	}
-}
-
-func TestCgroupParent(t *testing.T) {
-	if err := testutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := testutil.MakeDocker("cgroup-test")
-
-	parent := testutil.RandomName("runsc")
-	if err := d.Run("--cgroup-parent", parent, "alpine", "sleep", "10000"); err != nil {
-		t.Fatal("docker create failed:", err)
-	}
-	defer d.CleanUp()
-	gid, err := d.ID()
-	if err != nil {
-		t.Fatalf("Docker.ID() failed: %v", err)
-	}
-	t.Logf("cgroup ID: %s", gid)
-
-	// Check that sandbox is inside cgroup.
-	pid, err := d.SandboxPid()
-	if err != nil {
-		t.Fatalf("SandboxPid: %v", err)
-	}
-
-	// Finds cgroup for the sandbox's parent process to check that cgroup is
-	// created in the right location relative to the parent.
-	cmd := fmt.Sprintf("grep PPid: /proc/%d/status | sed 's/PPid:\\s//'", pid)
-	ppid, err := exec.Command("bash", "-c", cmd).CombinedOutput()
-	if err != nil {
-		t.Fatalf("Executing %q: %v", cmd, err)
-	}
-	cgroups, err := cgroup.LoadPaths(strings.TrimSpace(string(ppid)))
-	if err != nil {
-		t.Fatalf("cgroup.LoadPath(%s): %v", ppid, err)
-	}
-	path := filepath.Join("/sys/fs/cgroup/memory", cgroups["memory"], parent, gid, "cgroup.procs")
-	if err := verifyPid(pid, path); err != nil {
-		t.Errorf("cgroup control %q processes: %v", "memory", err)
-	}
-}
diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go
deleted file mode 100644
index d0f236580..000000000
--- a/runsc/test/root/chroot_test.go
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package root is used for tests that requires sysadmin privileges run. First,
-// follow the setup instruction in runsc/test/README.md. To run these tests:
-//
-//     bazel build //runsc/test/root:root_test
-//     root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__)
-//     sudo RUNSC_RUNTIME=runsc-test ${root_test}
-package root
-
-import (
-	"fmt"
-	"io/ioutil"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"testing"
-
-	"github.com/syndtr/gocapability/capability"
-	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-// TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned
-// up after the sandbox is destroyed.
-func TestChroot(t *testing.T) {
-	d := testutil.MakeDocker("chroot-test")
-	if err := d.Run("alpine", "sleep", "10000"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	pid, err := d.SandboxPid()
-	if err != nil {
-		t.Fatalf("Docker.SandboxPid(): %v", err)
-	}
-
-	// Check that sandbox is chroot'ed.
-	procRoot := filepath.Join("/proc", strconv.Itoa(pid), "root")
-	chroot, err := filepath.EvalSymlinks(procRoot)
-	if err != nil {
-		t.Fatalf("error resolving /proc/<pid>/root symlink: %v", err)
-	}
-	if chroot != "/" {
-		t.Errorf("sandbox is not chroot'd, it should be inside: /, got: %q", chroot)
-	}
-
-	path, err := filepath.EvalSymlinks(filepath.Join("/proc", strconv.Itoa(pid), "cwd"))
-	if err != nil {
-		t.Fatalf("error resolving /proc/<pid>/cwd symlink: %v", err)
-	}
-	if chroot != path {
-		t.Errorf("sandbox current dir is wrong, want: %q, got: %q", chroot, path)
-	}
-
-	fi, err := ioutil.ReadDir(procRoot)
-	if err != nil {
-		t.Fatalf("error listing %q: %v", chroot, err)
-	}
-	if want, got := 1, len(fi); want != got {
-		t.Fatalf("chroot dir got %d entries, want %d", got, want)
-	}
-
-	// chroot dir is prepared by runsc and should contains only /proc.
-	if fi[0].Name() != "proc" {
-		t.Errorf("chroot got children %v, want %v", fi[0].Name(), "proc")
-	}
-
-	d.CleanUp()
-}
-
-func TestChrootGofer(t *testing.T) {
-	d := testutil.MakeDocker("chroot-test")
-	if err := d.Run("alpine", "sleep", "10000"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
-	defer d.CleanUp()
-
-	// It's tricky to find gofers. Get sandbox PID first, then find parent. From
-	// parent get all immediate children, remove the sandbox, and everything else
-	// are gofers.
-	sandPID, err := d.SandboxPid()
-	if err != nil {
-		t.Fatalf("Docker.SandboxPid(): %v", err)
-	}
-
-	// Find sandbox's parent PID.
-	cmd := fmt.Sprintf("grep PPid /proc/%d/status | awk '{print $2}'", sandPID)
-	parent, err := exec.Command("sh", "-c", cmd).CombinedOutput()
-	if err != nil {
-		t.Fatalf("failed to fetch runsc (%d) parent PID: %v, out:\n%s", sandPID, err, string(parent))
-	}
-	parentPID, err := strconv.Atoi(strings.TrimSpace(string(parent)))
-	if err != nil {
-		t.Fatalf("failed to parse PPID %q: %v", string(parent), err)
-	}
-
-	// Get all children from parent.
-	childrenOut, err := exec.Command("/usr/bin/pgrep", "-P", strconv.Itoa(parentPID)).CombinedOutput()
-	if err != nil {
-		t.Fatalf("failed to fetch containerd-shim children: %v", err)
-	}
-	children := strings.Split(strings.TrimSpace(string(childrenOut)), "\n")
-
-	// This where the root directory is mapped on the host and that's where the
-	// gofer must have chroot'd to.
-	root := "/root"
-
-	for _, child := range children {
-		childPID, err := strconv.Atoi(child)
-		if err != nil {
-			t.Fatalf("failed to parse child PID %q: %v", child, err)
-		}
-		if childPID == sandPID {
-			// Skip the sandbox, all other immediate children are gofers.
-			continue
-		}
-
-		// Check that gofer is chroot'ed.
-		chroot, err := filepath.EvalSymlinks(filepath.Join("/proc", child, "root"))
-		if err != nil {
-			t.Fatalf("error resolving /proc/<pid>/root symlink: %v", err)
-		}
-		if root != chroot {
-			t.Errorf("gofer chroot is wrong, want: %q, got: %q", root, chroot)
-		}
-
-		path, err := filepath.EvalSymlinks(filepath.Join("/proc", child, "cwd"))
-		if err != nil {
-			t.Fatalf("error resolving /proc/<pid>/cwd symlink: %v", err)
-		}
-		if root != path {
-			t.Errorf("gofer current dir is wrong, want: %q, got: %q", root, path)
-		}
-	}
-}
-
-func TestMain(m *testing.M) {
-	testutil.EnsureSupportedDockerVersion()
-
-	if !specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_DAC_OVERRIDE) {
-		fmt.Println("Test requires sysadmin privileges to run. Try again with sudo.")
-		os.Exit(1)
-	}
-
-	os.Exit(m.Run())
-}
diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go
deleted file mode 100644
index 515ae2df1..000000000
--- a/runsc/test/root/crictl_test.go
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package root
-
-import (
-	"bytes"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"log"
-	"net/http"
-	"os"
-	"os/exec"
-	"path"
-	"path/filepath"
-	"strings"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/test/root/testdata"
-	"gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-// Tests for crictl have to be run as root (rather than in a user namespace)
-// because crictl creates named network namespaces in /var/run/netns/.
-
-func TestCrictlSanity(t *testing.T) {
-	// Setup containerd and crictl.
-	crictl, cleanup, err := setup(t)
-	if err != nil {
-		t.Fatalf("failed to setup crictl: %v", err)
-	}
-	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// Look for the httpd page.
-	if err = httpGet(crictl, podID, "index.html"); err != nil {
-		t.Fatalf("failed to get page: %v", err)
-	}
-
-	// Stop everything.
-	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestMountPaths(t *testing.T) {
-	// Setup containerd and crictl.
-	crictl, cleanup, err := setup(t)
-	if err != nil {
-		t.Fatalf("failed to setup crictl: %v", err)
-	}
-	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.HttpdMountPaths)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// Look for the directory available at /test.
-	if err = httpGet(crictl, podID, "test"); err != nil {
-		t.Fatalf("failed to get page: %v", err)
-	}
-
-	// Stop everything.
-	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestMountOverSymlinks(t *testing.T) {
-	// Setup containerd and crictl.
-	crictl, cleanup, err := setup(t)
-	if err != nil {
-		t.Fatalf("failed to setup crictl: %v", err)
-	}
-	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, testdata.MountOverSymlink)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf")
-	if err != nil {
-		t.Fatal(err)
-	}
-	if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) {
-		t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out))
-	}
-
-	etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf")
-	if err != nil {
-		t.Fatal(err)
-	}
-	tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf")
-	if err != nil {
-		t.Fatal(err)
-	}
-	if tmp != etc {
-		t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp))
-	}
-
-	// Stop everything.
-	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
-	}
-}
-
-// setup sets up before a test. Specifically it:
-// * Creates directories and a socket for containerd to utilize.
-// * Runs containerd and waits for it to reach a "ready" state for testing.
-// * Returns a cleanup function that should be called at the end of the test.
-func setup(t *testing.T) (*testutil.Crictl, func(), error) {
-	var cleanups []func()
-	cleanupFunc := func() {
-		for i := len(cleanups) - 1; i >= 0; i-- {
-			cleanups[i]()
-		}
-	}
-	cleanup := specutils.MakeCleanup(cleanupFunc)
-	defer cleanup.Clean()
-
-	// Create temporary containerd root and state directories, and a socket
-	// via which crictl and containerd communicate.
-	containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root")
-	if err != nil {
-		t.Fatalf("failed to create containerd root: %v", err)
-	}
-	cleanups = append(cleanups, func() { os.RemoveAll(containerdRoot) })
-	containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state")
-	if err != nil {
-		t.Fatalf("failed to create containerd state: %v", err)
-	}
-	cleanups = append(cleanups, func() { os.RemoveAll(containerdState) })
-	sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock")
-
-	// Start containerd.
-	config, err := testutil.WriteTmpFile("containerd-config", testdata.ContainerdConfig(getRunsc()))
-	if err != nil {
-		t.Fatalf("failed to write containerd config")
-	}
-	cleanups = append(cleanups, func() { os.RemoveAll(config) })
-	containerd := exec.Command(getContainerd(),
-		"--config", config,
-		"--log-level", "debug",
-		"--root", containerdRoot,
-		"--state", containerdState,
-		"--address", sockAddr)
-	cleanups = append(cleanups, func() {
-		if err := testutil.KillCommand(containerd); err != nil {
-			log.Printf("error killing containerd: %v", err)
-		}
-	})
-	containerdStderr, err := containerd.StderrPipe()
-	if err != nil {
-		t.Fatalf("failed to get containerd stderr: %v", err)
-	}
-	containerdStdout, err := containerd.StdoutPipe()
-	if err != nil {
-		t.Fatalf("failed to get containerd stdout: %v", err)
-	}
-	if err := containerd.Start(); err != nil {
-		t.Fatalf("failed running containerd: %v", err)
-	}
-
-	// Wait for containerd to boot. Then put all containerd output into a
-	// buffer to be logged at the end of the test.
-	testutil.WaitUntilRead(containerdStderr, "Start streaming server", nil, 10*time.Second)
-	stdoutBuf := &bytes.Buffer{}
-	stderrBuf := &bytes.Buffer{}
-	go func() { io.Copy(stdoutBuf, containerdStdout) }()
-	go func() { io.Copy(stderrBuf, containerdStderr) }()
-	cleanups = append(cleanups, func() {
-		t.Logf("containerd stdout: %s", string(stdoutBuf.Bytes()))
-		t.Logf("containerd stderr: %s", string(stderrBuf.Bytes()))
-	})
-
-	cleanup.Release()
-	return testutil.NewCrictl(20*time.Second, sockAddr), cleanupFunc, nil
-}
-
-// httpGet GETs the contents of a file served from a pod on port 80.
-func httpGet(crictl *testutil.Crictl, podID, filePath string) error {
-	// Get the IP of the httpd server.
-	ip, err := crictl.PodIP(podID)
-	if err != nil {
-		return fmt.Errorf("failed to get IP from pod %q: %v", podID, err)
-	}
-
-	// GET the page. We may be waiting for the server to start, so retry
-	// with a timeout.
-	var resp *http.Response
-	cb := func() error {
-		r, err := http.Get(fmt.Sprintf("http://%s", path.Join(ip, filePath)))
-		resp = r
-		return err
-	}
-	if err := testutil.Poll(cb, 20*time.Second); err != nil {
-		return err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != 200 {
-		return fmt.Errorf("bad status returned: %d", resp.StatusCode)
-	}
-	return nil
-}
-
-func getContainerd() string {
-	// Bazel doesn't pass PATH through, assume the location of containerd
-	// unless specified by environment variable.
-	c := os.Getenv("CONTAINERD_PATH")
-	if c == "" {
-		return "/usr/local/bin/containerd"
-	}
-	return c
-}
-
-func getRunsc() string {
-	// Bazel doesn't pass PATH through, assume the location of runsc unless
-	// specified by environment variable.
-	c := os.Getenv("RUNSC_EXEC")
-	if c == "" {
-		return "/tmp/runsc-test/runsc"
-	}
-	return c
-}
diff --git a/runsc/test/root/root.go b/runsc/test/root/root.go
deleted file mode 100644
index 349c752cc..000000000
--- a/runsc/test/root/root.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package root is empty. See chroot_test.go for description.
-package root
diff --git a/runsc/test/root/testdata/BUILD b/runsc/test/root/testdata/BUILD
deleted file mode 100644
index 80dc5f214..000000000
--- a/runsc/test/root/testdata/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "testdata",
-    srcs = [
-        "busybox.go",
-        "containerd_config.go",
-        "httpd.go",
-        "httpd_mount_paths.go",
-        "sandbox.go",
-    ],
-    importpath = "gvisor.dev/gvisor/runsc/test/root/testdata",
-    visibility = [
-        "//visibility:public",
-    ],
-)
diff --git a/runsc/test/root/testdata/busybox.go b/runsc/test/root/testdata/busybox.go
deleted file mode 100644
index e4dbd2843..000000000
--- a/runsc/test/root/testdata/busybox.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// MountOverSymlink is a JSON config for a container that /etc/resolv.conf is a
-// symlink to /tmp/resolv.conf.
-var MountOverSymlink = `
-{
-        "metadata": {
-                "name": "busybox"
-        },
-        "image": {
-                "image": "k8s.gcr.io/busybox"
-        },
-        "command": [
-                "sleep",
-                "1000"
-        ]
-}
-`
diff --git a/runsc/test/root/testdata/containerd_config.go b/runsc/test/root/testdata/containerd_config.go
deleted file mode 100644
index e12f1ec88..000000000
--- a/runsc/test/root/testdata/containerd_config.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package testdata contains data required for root tests.
-package testdata
-
-import "fmt"
-
-// containerdConfigTemplate is a .toml config for containerd. It contains a
-// formatting verb so the runtime field can be set via fmt.Sprintf.
-const containerdConfigTemplate = `
-disabled_plugins = ["restart"]
-[plugins.linux]
-  runtime = "%s"
-  runtime_root = "/tmp/test-containerd/runsc"
-  shim = "/usr/local/bin/gvisor-containerd-shim"
-  shim_debug = true
-
-[plugins.cri.containerd.runtimes.runsc]
-  runtime_type = "io.containerd.runtime.v1.linux"
-  runtime_engine = "%s"
-`
-
-// ContainerdConfig returns a containerd config file with the specified
-// runtime.
-func ContainerdConfig(runtime string) string {
-	return fmt.Sprintf(containerdConfigTemplate, runtime, runtime)
-}
diff --git a/runsc/test/root/testdata/httpd.go b/runsc/test/root/testdata/httpd.go
deleted file mode 100644
index 45d5e33d4..000000000
--- a/runsc/test/root/testdata/httpd.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// Httpd is a JSON config for an httpd container.
-const Httpd = `
-{
-  "metadata": {
-    "name": "httpd"
-  },
-  "image":{
-    "image": "httpd"
-  },
-  "mounts": [
-  ],
-  "linux": {
-  },
-  "log_path": "httpd.log"
-}
-`
diff --git a/runsc/test/root/testdata/httpd_mount_paths.go b/runsc/test/root/testdata/httpd_mount_paths.go
deleted file mode 100644
index ac3f4446a..000000000
--- a/runsc/test/root/testdata/httpd_mount_paths.go
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// HttpdMountPaths is a JSON config for an httpd container with additional
-// mounts.
-const HttpdMountPaths = `
-{
-  "metadata": {
-    "name": "httpd"
-  },
-  "image":{
-    "image": "httpd"
-  },
-  "mounts": [
-      {
-        "container_path": "/var/run/secrets/kubernetes.io/serviceaccount",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx",
-        "readonly": true
-      },
-      {
-        "container_path": "/etc/hosts",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts",
-        "readonly": false
-      },
-      {
-        "container_path": "/dev/termination-log",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580",
-        "readonly": false
-      },
-      {
-        "container_path": "/usr/local/apache2/htdocs/test",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064",
-        "readonly": true
-      }
-  ],
-  "linux": {
-  },
-  "log_path": "httpd.log"
-}
-`
diff --git a/runsc/test/root/testdata/sandbox.go b/runsc/test/root/testdata/sandbox.go
deleted file mode 100644
index 0db210370..000000000
--- a/runsc/test/root/testdata/sandbox.go
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// Sandbox is a default JSON config for a sandbox.
-const Sandbox = `
-{
-    "metadata": {
-        "name": "default-sandbox",
-        "namespace": "default",
-        "attempt": 1,
-        "uid": "hdishd83djaidwnduwk28bcsb"
-    },
-    "linux": {
-    },
-    "log_directory": "/tmp"
-}
-`
diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD
deleted file mode 100644
index 327e7ca4d..000000000
--- a/runsc/test/testutil/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "testutil",
-    srcs = [
-        "crictl.go",
-        "docker.go",
-        "testutil.go",
-        "testutil_race.go",
-    ],
-    importpath = "gvisor.dev/gvisor/runsc/test/testutil",
-    visibility = ["//:sandbox"],
-    deps = [
-        "//runsc/boot",
-        "//runsc/specutils",
-        "@com_github_cenkalti_backoff//:go_default_library",
-        "@com_github_kr_pty//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-    ],
-)
diff --git a/runsc/test/testutil/crictl.go b/runsc/test/testutil/crictl.go
deleted file mode 100644
index 4f9ee0c05..000000000
--- a/runsc/test/testutil/crictl.go
+++ /dev/null
@@ -1,241 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testutil
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"os/exec"
-	"strings"
-	"time"
-)
-
-const endpointPrefix = "unix://"
-
-// Crictl contains information required to run the crictl utility.
-type Crictl struct {
-	executable      string
-	timeout         time.Duration
-	imageEndpoint   string
-	runtimeEndpoint string
-}
-
-// NewCrictl returns a Crictl configured with a timeout and an endpoint over
-// which it will talk to containerd.
-func NewCrictl(timeout time.Duration, endpoint string) *Crictl {
-	// Bazel doesn't pass PATH through, assume the location of crictl
-	// unless specified by environment variable.
-	executable := os.Getenv("CRICTL_PATH")
-	if executable == "" {
-		executable = "/usr/local/bin/crictl"
-	}
-	return &Crictl{
-		executable:      executable,
-		timeout:         timeout,
-		imageEndpoint:   endpointPrefix + endpoint,
-		runtimeEndpoint: endpointPrefix + endpoint,
-	}
-}
-
-// Pull pulls an container image. It corresponds to `crictl pull`.
-func (cc *Crictl) Pull(imageName string) error {
-	_, err := cc.run("pull", imageName)
-	return err
-}
-
-// RunPod creates a sandbox. It corresponds to `crictl runp`.
-func (cc *Crictl) RunPod(sbSpecFile string) (string, error) {
-	podID, err := cc.run("runp", sbSpecFile)
-	if err != nil {
-		return "", fmt.Errorf("runp failed: %v", err)
-	}
-	// Strip the trailing newline from crictl output.
-	return strings.TrimSpace(podID), nil
-}
-
-// Create creates a container within a sandbox. It corresponds to `crictl
-// create`.
-func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) {
-	podID, err := cc.run("create", podID, contSpecFile, sbSpecFile)
-	if err != nil {
-		return "", fmt.Errorf("create failed: %v", err)
-	}
-	// Strip the trailing newline from crictl output.
-	return strings.TrimSpace(podID), nil
-}
-
-// Start starts a container. It corresponds to `crictl start`.
-func (cc *Crictl) Start(contID string) (string, error) {
-	output, err := cc.run("start", contID)
-	if err != nil {
-		return "", fmt.Errorf("start failed: %v", err)
-	}
-	return output, nil
-}
-
-// Stop stops a container. It corresponds to `crictl stop`.
-func (cc *Crictl) Stop(contID string) error {
-	_, err := cc.run("stop", contID)
-	return err
-}
-
-// Exec execs a program inside a container. It corresponds to `crictl exec`.
-func (cc *Crictl) Exec(contID string, args ...string) (string, error) {
-	a := []string{"exec", contID}
-	a = append(a, args...)
-	output, err := cc.run(a...)
-	if err != nil {
-		return "", fmt.Errorf("exec failed: %v", err)
-	}
-	return output, nil
-}
-
-// Rm removes a container. It corresponds to `crictl rm`.
-func (cc *Crictl) Rm(contID string) error {
-	_, err := cc.run("rm", contID)
-	return err
-}
-
-// StopPod stops a pod. It corresponds to `crictl stopp`.
-func (cc *Crictl) StopPod(podID string) error {
-	_, err := cc.run("stopp", podID)
-	return err
-}
-
-// containsConfig is a minimal copy of
-// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto
-// It only contains fields needed for testing.
-type containerConfig struct {
-	Status containerStatus
-}
-
-type containerStatus struct {
-	Network containerNetwork
-}
-
-type containerNetwork struct {
-	IP string
-}
-
-// PodIP returns a pod's IP address.
-func (cc *Crictl) PodIP(podID string) (string, error) {
-	output, err := cc.run("inspectp", podID)
-	if err != nil {
-		return "", err
-	}
-	conf := &containerConfig{}
-	if err := json.Unmarshal([]byte(output), conf); err != nil {
-		return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output)
-	}
-	if conf.Status.Network.IP == "" {
-		return "", fmt.Errorf("no IP found in config: %s", output)
-	}
-	return conf.Status.Network.IP, nil
-}
-
-// RmPod removes a container. It corresponds to `crictl rmp`.
-func (cc *Crictl) RmPod(podID string) error {
-	_, err := cc.run("rmp", podID)
-	return err
-}
-
-// StartPodAndContainer pulls an image, then starts a sandbox and container in
-// that sandbox. It returns the pod ID and container ID.
-func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
-	if err := cc.Pull(image); err != nil {
-		return "", "", fmt.Errorf("failed to pull %s: %v", image, err)
-	}
-
-	// Write the specs to files that can be read by crictl.
-	sbSpecFile, err := WriteTmpFile("sbSpec", sbSpec)
-	if err != nil {
-		return "", "", fmt.Errorf("failed to write sandbox spec: %v", err)
-	}
-	contSpecFile, err := WriteTmpFile("contSpec", contSpec)
-	if err != nil {
-		return "", "", fmt.Errorf("failed to write container spec: %v", err)
-	}
-
-	podID, err := cc.RunPod(sbSpecFile)
-	if err != nil {
-		return "", "", err
-	}
-
-	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
-	if err != nil {
-		return "", "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
-	}
-
-	if _, err := cc.Start(contID); err != nil {
-		return "", "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
-	}
-
-	return podID, contID, nil
-}
-
-// StopPodAndContainer stops a container and pod.
-func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
-	if err := cc.Stop(contID); err != nil {
-		return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err)
-	}
-
-	if err := cc.Rm(contID); err != nil {
-		return fmt.Errorf("failed to remove container %q in pod %q: %v", contID, podID, err)
-	}
-
-	if err := cc.StopPod(podID); err != nil {
-		return fmt.Errorf("failed to stop pod %q: %v", podID, err)
-	}
-
-	if err := cc.RmPod(podID); err != nil {
-		return fmt.Errorf("failed to remove pod %q: %v", podID, err)
-	}
-
-	return nil
-}
-
-// run runs crictl with the given args and returns an error if it takes longer
-// than cc.Timeout to run.
-func (cc *Crictl) run(args ...string) (string, error) {
-	defaultArgs := []string{
-		"--image-endpoint", cc.imageEndpoint,
-		"--runtime-endpoint", cc.runtimeEndpoint,
-	}
-	cmd := exec.Command(cc.executable, append(defaultArgs, args...)...)
-
-	// Run the command with a timeout.
-	done := make(chan string)
-	errCh := make(chan error)
-	go func() {
-		output, err := cmd.CombinedOutput()
-		if err != nil {
-			errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output))
-			return
-		}
-		done <- string(output)
-	}()
-	select {
-	case output := <-done:
-		return output, nil
-	case err := <-errCh:
-		return "", err
-	case <-time.After(cc.timeout):
-		if err := KillCommand(cmd); err != nil {
-			return "", fmt.Errorf("timed out, then couldn't kill process %+v: %v", cmd, err)
-		}
-		return "", fmt.Errorf("timed out: %+v", cmd)
-	}
-}
diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go
deleted file mode 100644
index 94e625259..000000000
--- a/runsc/test/testutil/docker.go
+++ /dev/null
@@ -1,410 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testutil
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"log"
-	"os"
-	"os/exec"
-	"path"
-	"regexp"
-	"strconv"
-	"strings"
-	"syscall"
-	"time"
-
-	"github.com/kr/pty"
-)
-
-var runtimeType = flag.String("runtime-type", "", "specify which runtime to use: kvm, hostnet, overlay")
-
-func getRuntime() string {
-	r, ok := os.LookupEnv("RUNSC_RUNTIME")
-	if !ok {
-		r = "runsc-test"
-	}
-	if *runtimeType != "" {
-		r += "-" + *runtimeType
-	}
-	return r
-}
-
-// IsPauseResumeSupported returns true if Pause/Resume is supported by runtime.
-func IsPauseResumeSupported() bool {
-	// Native host network stack can't be saved.
-	return !strings.Contains(getRuntime(), "hostnet")
-}
-
-// EnsureSupportedDockerVersion checks if correct docker is installed.
-func EnsureSupportedDockerVersion() {
-	cmd := exec.Command("docker", "version")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		log.Fatalf("Error running %q: %v", "docker version", err)
-	}
-	re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`)
-	matches := re.FindStringSubmatch(string(out))
-	if len(matches) != 3 {
-		log.Fatalf("Invalid docker output: %s", out)
-	}
-	major, _ := strconv.Atoi(matches[1])
-	minor, _ := strconv.Atoi(matches[2])
-	if major < 17 || (major == 17 && minor < 9) {
-		log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor)
-	}
-}
-
-// MountMode describes if the mount should be ro or rw.
-type MountMode int
-
-const (
-	// ReadOnly is what the name says.
-	ReadOnly MountMode = iota
-	// ReadWrite is what the name says.
-	ReadWrite
-)
-
-// String returns the mount mode argument for this MountMode.
-func (m MountMode) String() string {
-	switch m {
-	case ReadOnly:
-		return "ro"
-	case ReadWrite:
-		return "rw"
-	}
-	panic(fmt.Sprintf("invalid mode: %d", m))
-}
-
-// MountArg formats the volume argument to mount in the container.
-func MountArg(source, target string, mode MountMode) string {
-	return fmt.Sprintf("-v=%s:%s:%v", source, target, mode)
-}
-
-// LinkArg formats the link argument.
-func LinkArg(source *Docker, target string) string {
-	return fmt.Sprintf("--link=%s:%s", source.Name, target)
-}
-
-// PrepareFiles creates temp directory to copy files there. The sandbox doesn't
-// have access to files in the test dir.
-func PrepareFiles(names ...string) (string, error) {
-	dir, err := ioutil.TempDir("", "image-test")
-	if err != nil {
-		return "", fmt.Errorf("ioutil.TempDir failed: %v", err)
-	}
-	if err := os.Chmod(dir, 0777); err != nil {
-		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
-	}
-	for _, name := range names {
-		src := getLocalPath(name)
-		dst := path.Join(dir, name)
-		if err := Copy(src, dst); err != nil {
-			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
-		}
-	}
-	return dir, nil
-}
-
-func getLocalPath(file string) string {
-	return path.Join(".", file)
-}
-
-// do executes docker command.
-func do(args ...string) (string, error) {
-	log.Printf("Running: docker %s\n", args)
-	cmd := exec.Command("docker", args...)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("error executing docker %s: %v\nout: %s", args, err, out)
-	}
-	return string(out), nil
-}
-
-// doWithPty executes docker command with stdio attached to a pty.
-func doWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	log.Printf("Running with pty: docker %s\n", args)
-	cmd := exec.Command("docker", args...)
-	ptmx, err := pty.Start(cmd)
-	if err != nil {
-		return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err)
-	}
-	return cmd, ptmx, nil
-}
-
-// Pull pulls a docker image. This is used in tests to isolate the
-// time to pull the image off the network from the time to actually
-// start the container, to avoid timeouts over slow networks.
-func Pull(image string) error {
-	_, err := do("pull", image)
-	return err
-}
-
-// Docker contains the name and the runtime of a docker container.
-type Docker struct {
-	Runtime string
-	Name    string
-}
-
-// MakeDocker sets up the struct for a Docker container.
-// Names of containers will be unique.
-func MakeDocker(namePrefix string) Docker {
-	return Docker{Name: RandomName(namePrefix), Runtime: getRuntime()}
-}
-
-// logDockerID logs a container id, which is needed to find container runsc logs.
-func (d *Docker) logDockerID() {
-	id, err := d.ID()
-	if err != nil {
-		log.Printf("%v\n", err)
-	}
-	log.Printf("Name: %s ID: %v\n", d.Name, id)
-}
-
-// Create calls 'docker create' with the arguments provided.
-func (d *Docker) Create(args ...string) error {
-	a := []string{"create", "--runtime", d.Runtime, "--name", d.Name}
-	a = append(a, args...)
-	_, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return err
-}
-
-// Start calls 'docker start'.
-func (d *Docker) Start() error {
-	if _, err := do("start", d.Name); err != nil {
-		return fmt.Errorf("error starting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Stop calls 'docker stop'.
-func (d *Docker) Stop() error {
-	if _, err := do("stop", d.Name); err != nil {
-		return fmt.Errorf("error stopping container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Run calls 'docker run' with the arguments provided. The container starts
-// running in the background and the call returns immediately.
-func (d *Docker) Run(args ...string) error {
-	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"}
-	a = append(a, args...)
-	_, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return err
-}
-
-// RunWithPty is like Run but with an attached pty.
-func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-it"}
-	a = append(a, args...)
-	return doWithPty(a...)
-}
-
-// RunFg calls 'docker run' with the arguments provided in the foreground. It
-// blocks until the container exits and returns the output.
-func (d *Docker) RunFg(args ...string) (string, error) {
-	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name}
-	a = append(a, args...)
-	out, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return string(out), err
-}
-
-// Logs calls 'docker logs'.
-func (d *Docker) Logs() (string, error) {
-	return do("logs", d.Name)
-}
-
-// Exec calls 'docker exec' with the arguments provided.
-func (d *Docker) Exec(args ...string) (string, error) {
-	a := []string{"exec", d.Name}
-	a = append(a, args...)
-	return do(a...)
-}
-
-// ExecWithTerminal calls 'docker exec -it' with the arguments provided and
-// attaches a pty to stdio.
-func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) {
-	a := []string{"exec", "-it", d.Name}
-	a = append(a, args...)
-	return doWithPty(a...)
-}
-
-// Pause calls 'docker pause'.
-func (d *Docker) Pause() error {
-	if _, err := do("pause", d.Name); err != nil {
-		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Unpause calls 'docker pause'.
-func (d *Docker) Unpause() error {
-	if _, err := do("unpause", d.Name); err != nil {
-		return fmt.Errorf("error unpausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Checkpoint calls 'docker checkpoint'.
-func (d *Docker) Checkpoint(name string) error {
-	if _, err := do("checkpoint", "create", d.Name, name); err != nil {
-		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Restore calls 'docker start --checkname [name]'.
-func (d *Docker) Restore(name string) error {
-	if _, err := do("start", "--checkpoint", name, d.Name); err != nil {
-		return fmt.Errorf("error starting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Remove calls 'docker rm'.
-func (d *Docker) Remove() error {
-	if _, err := do("rm", d.Name); err != nil {
-		return fmt.Errorf("error deleting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// CleanUp kills and deletes the container (best effort).
-func (d *Docker) CleanUp() {
-	d.logDockerID()
-	if _, err := do("kill", d.Name); err != nil {
-		if strings.Contains(err.Error(), "is not running") {
-			// Nothing to kill. Don't log the error in this case.
-		} else {
-			log.Printf("error killing container %q: %v", d.Name, err)
-		}
-	}
-	if err := d.Remove(); err != nil {
-		log.Print(err)
-	}
-}
-
-// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
-// docker to allocate a free port in the host and prevent conflicts.
-func (d *Docker) FindPort(sandboxPort int) (int, error) {
-	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
-	out, err := do("inspect", "-f", format, d.Name)
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving port: %v", err)
-	}
-	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
-	}
-	return port, nil
-}
-
-// SandboxPid returns the PID to the sandbox process.
-func (d *Docker) SandboxPid() (int, error) {
-	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving pid: %v", err)
-	}
-	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
-	}
-	return pid, nil
-}
-
-// ID returns the container ID.
-func (d *Docker) ID() (string, error) {
-	out, err := do("inspect", "-f={{.Id}}", d.Name)
-	if err != nil {
-		return "", fmt.Errorf("error retrieving ID: %v", err)
-	}
-	return strings.TrimSpace(string(out)), nil
-}
-
-// Wait waits for container to exit, up to the given timeout. Returns error if
-// wait fails or timeout is hit. Returns the application return code otherwise.
-// Note that the application may have failed even if err == nil, always check
-// the exit code.
-func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
-	timeoutChan := time.After(timeout)
-	waitChan := make(chan (syscall.WaitStatus))
-	errChan := make(chan (error))
-
-	go func() {
-		out, err := do("wait", d.Name)
-		if err != nil {
-			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
-		}
-		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-		if err != nil {
-			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
-		}
-		waitChan <- syscall.WaitStatus(uint32(exit))
-	}()
-
-	select {
-	case ws := <-waitChan:
-		return ws, nil
-	case err := <-errChan:
-		return syscall.WaitStatus(1), err
-	case <-timeoutChan:
-		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
-	}
-}
-
-// WaitForOutput calls 'docker logs' to retrieve containers output and searches
-// for the given pattern.
-func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
-	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
-	if err != nil {
-		return "", err
-	}
-	if len(matches) == 0 {
-		return "", nil
-	}
-	return matches[0], nil
-}
-
-// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
-// searches for the given pattern. It returns any regexp submatches as well.
-func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
-	re := regexp.MustCompile(pattern)
-	var out string
-	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
-		var err error
-		out, err = d.Logs()
-		if err != nil {
-			return nil, err
-		}
-		if matches := re.FindStringSubmatch(out); matches != nil {
-			// Success!
-			return matches, nil
-		}
-		time.Sleep(100 * time.Millisecond)
-	}
-	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out)
-}
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
deleted file mode 100644
index 4a3dfa0e3..000000000
--- a/runsc/test/testutil/testutil.go
+++ /dev/null
@@ -1,421 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package testutil contains utility functions for runsc tests.
-package testutil
-
-import (
-	"bufio"
-	"context"
-	"encoding/base32"
-	"encoding/json"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"log"
-	"math/rand"
-	"net/http"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"sync"
-	"sync/atomic"
-	"syscall"
-	"time"
-
-	"github.com/cenkalti/backoff"
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/specutils"
-)
-
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
-
-// RaceEnabled is set to true if it was built with '--race' option.
-var RaceEnabled = false
-
-// TmpDir returns the absolute path to a writable directory that can be used as
-// scratch by the test.
-func TmpDir() string {
-	dir := os.Getenv("TEST_TMPDIR")
-	if dir == "" {
-		dir = "/tmp"
-	}
-	return dir
-}
-
-// ConfigureExePath configures the executable for runsc in the test environment.
-func ConfigureExePath() error {
-	path, err := FindFile("runsc/runsc")
-	if err != nil {
-		return err
-	}
-	specutils.ExePath = path
-	return nil
-}
-
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
-
-// TestConfig returns the default configuration to use in tests. Note that
-// 'RootDir' must be set by caller if required.
-func TestConfig() *boot.Config {
-	return &boot.Config{
-		Debug:           true,
-		LogFormat:       "text",
-		DebugLogFormat:  "text",
-		AlsoLogToStderr: true,
-		LogPackets:      true,
-		Network:         boot.NetworkNone,
-		Strace:          true,
-		Platform:        "ptrace",
-		FileAccess:      boot.FileAccessExclusive,
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
-		NumNetworkChannels:                         1,
-	}
-}
-
-// TestConfigWithRoot returns the default configuration to use in tests.
-func TestConfigWithRoot(rootDir string) *boot.Config {
-	conf := TestConfig()
-	conf.RootDir = rootDir
-	return conf
-}
-
-// NewSpecWithArgs creates a simple spec with the given args suitable for use
-// in tests.
-func NewSpecWithArgs(args ...string) *specs.Spec {
-	return &specs.Spec{
-		// The host filesystem root is the container root.
-		Root: &specs.Root{
-			Path:     "/",
-			Readonly: true,
-		},
-		Process: &specs.Process{
-			Args: args,
-			Env: []string{
-				"PATH=" + os.Getenv("PATH"),
-			},
-			Capabilities: specutils.AllCapabilities(),
-		},
-		Mounts: []specs.Mount{
-			// Root is readonly, but many tests want to write to tmpdir.
-			// This creates a writable mount inside the root. Also, when tmpdir points
-			// to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs
-			// inside the sentry.
-			{
-				Type:        "bind",
-				Destination: TmpDir(),
-				Source:      TmpDir(),
-			},
-		},
-		Hostname: "runsc-test-hostname",
-	}
-}
-
-// SetupRootDir creates a root directory for containers.
-func SetupRootDir() (string, error) {
-	rootDir, err := ioutil.TempDir(TmpDir(), "containers")
-	if err != nil {
-		return "", fmt.Errorf("error creating root dir: %v", err)
-	}
-	return rootDir, nil
-}
-
-// SetupContainer creates a bundle and root dir for the container, generates a
-// test config, and writes the spec to config.json in the bundle dir.
-func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
-	// Setup root dir if one hasn't been provided.
-	if len(conf.RootDir) == 0 {
-		rootDir, err = SetupRootDir()
-		if err != nil {
-			return "", "", err
-		}
-		conf.RootDir = rootDir
-	}
-	bundleDir, err = SetupBundleDir(spec)
-	return rootDir, bundleDir, err
-}
-
-// SetupBundleDir creates a bundle dir and writes the spec to config.json.
-func SetupBundleDir(spec *specs.Spec) (bundleDir string, err error) {
-	bundleDir, err = ioutil.TempDir(TmpDir(), "bundle")
-	if err != nil {
-		return "", fmt.Errorf("error creating bundle dir: %v", err)
-	}
-
-	if err = writeSpec(bundleDir, spec); err != nil {
-		return "", fmt.Errorf("error writing spec: %v", err)
-	}
-	return bundleDir, nil
-}
-
-// writeSpec writes the spec to disk in the given directory.
-func writeSpec(dir string, spec *specs.Spec) error {
-	b, err := json.Marshal(spec)
-	if err != nil {
-		return err
-	}
-	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
-}
-
-// UniqueContainerID generates a unique container id for each test.
-//
-// The container id is used to create an abstract unix domain socket, which must
-// be unique.  While the container forbids creating two containers with the same
-// name, sometimes between test runs the socket does not get cleaned up quickly
-// enough, causing container creation to fail.
-func UniqueContainerID() string {
-	// Read 20 random bytes.
-	b := make([]byte, 20)
-	// "[Read] always returns len(p) and a nil error." --godoc
-	if _, err := rand.Read(b); err != nil {
-		panic("rand.Read failed: " + err.Error())
-	}
-	// base32 encode the random bytes, so that the name is a valid
-	// container id and can be used as a socket name in the filesystem.
-	return fmt.Sprintf("test-container-%s", base32.StdEncoding.EncodeToString(b))
-}
-
-// Copy copies file from src to dst.
-func Copy(src, dst string) error {
-	in, err := os.Open(src)
-	if err != nil {
-		return err
-	}
-	defer in.Close()
-
-	out, err := os.Create(dst)
-	if err != nil {
-		return err
-	}
-	defer out.Close()
-
-	_, err = io.Copy(out, in)
-	return err
-}
-
-// Poll is a shorthand function to poll for something with given timeout.
-func Poll(cb func() error, timeout time.Duration) error {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	defer cancel()
-	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
-	return backoff.Retry(cb, b)
-}
-
-// WaitForHTTP tries GET requests on a port until the call succeeds or timeout.
-func WaitForHTTP(port int, timeout time.Duration) error {
-	cb := func() error {
-		c := &http.Client{
-			// Calculate timeout to be able to do minimum 5 attempts.
-			Timeout: timeout / 5,
-		}
-		url := fmt.Sprintf("http://localhost:%d/", port)
-		resp, err := c.Get(url)
-		if err != nil {
-			log.Printf("Waiting %s: %v", url, err)
-			return err
-		}
-		resp.Body.Close()
-		return nil
-	}
-	return Poll(cb, timeout)
-}
-
-// Reaper reaps child processes.
-type Reaper struct {
-	// mu protects ch, which will be nil if the reaper is not running.
-	mu sync.Mutex
-	ch chan os.Signal
-}
-
-// Start starts reaping child processes.
-func (r *Reaper) Start() {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-
-	if r.ch != nil {
-		panic("reaper.Start called on a running reaper")
-	}
-
-	r.ch = make(chan os.Signal, 1)
-	signal.Notify(r.ch, syscall.SIGCHLD)
-
-	go func() {
-		for {
-			r.mu.Lock()
-			ch := r.ch
-			r.mu.Unlock()
-			if ch == nil {
-				return
-			}
-
-			_, ok := <-ch
-			if !ok {
-				// Channel closed.
-				return
-			}
-			for {
-				cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil)
-				if cpid < 1 {
-					break
-				}
-			}
-		}
-	}()
-}
-
-// Stop stops reaping child processes.
-func (r *Reaper) Stop() {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-
-	if r.ch == nil {
-		panic("reaper.Stop called on a stopped reaper")
-	}
-
-	signal.Stop(r.ch)
-	close(r.ch)
-	r.ch = nil
-}
-
-// StartReaper is a helper that starts a new Reaper and returns a function to
-// stop it.
-func StartReaper() func() {
-	r := &Reaper{}
-	r.Start()
-	return r.Stop
-}
-
-// WaitUntilRead reads from the given reader until the wanted string is found
-// or until timeout.
-func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {
-	sc := bufio.NewScanner(r)
-	if split != nil {
-		sc.Split(split)
-	}
-	// done must be accessed atomically. A value greater than 0 indicates
-	// that the read loop can exit.
-	var done uint32
-	doneCh := make(chan struct{})
-	go func() {
-		for sc.Scan() {
-			t := sc.Text()
-			if strings.Contains(t, want) {
-				atomic.StoreUint32(&done, 1)
-				close(doneCh)
-				break
-			}
-			if atomic.LoadUint32(&done) > 0 {
-				break
-			}
-		}
-	}()
-	select {
-	case <-time.After(timeout):
-		atomic.StoreUint32(&done, 1)
-		return fmt.Errorf("timeout waiting to read %q", want)
-	case <-doneCh:
-		return nil
-	}
-}
-
-// KillCommand kills the process running cmd unless it hasn't been started. It
-// returns an error if it cannot kill the process unless the reason is that the
-// process has already exited.
-func KillCommand(cmd *exec.Cmd) error {
-	if cmd.Process == nil {
-		return nil
-	}
-	if err := cmd.Process.Kill(); err != nil {
-		if !strings.Contains(err.Error(), "process already finished") {
-			return fmt.Errorf("failed to kill process %v: %v", cmd, err)
-		}
-	}
-	return nil
-}
-
-// WriteTmpFile writes text to a temporary file, closes the file, and returns
-// the name of the file.
-func WriteTmpFile(pattern, text string) (string, error) {
-	file, err := ioutil.TempFile(TmpDir(), pattern)
-	if err != nil {
-		return "", err
-	}
-	defer file.Close()
-	if _, err := file.Write([]byte(text)); err != nil {
-		return "", err
-	}
-	return file.Name(), nil
-}
-
-// RandomName create a name with a 6 digit random number appended to it.
-func RandomName(prefix string) string {
-	return fmt.Sprintf("%s-%06d", prefix, rand.Int31n(1000000))
-}
diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go
deleted file mode 100644
index 86db6ffa1..000000000
--- a/runsc/test/testutil/testutil_race.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build race
-
-package testutil
-
-func init() {
-	RaceEnabled = true
-}
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
new file mode 100644
index 000000000..d44ebc906
--- /dev/null
+++ b/runsc/testutil/BUILD
@@ -0,0 +1,17 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "testutil",
+    testonly = 1,
+    srcs = ["testutil.go"],
+    importpath = "gvisor.dev/gvisor/runsc/testutil",
+    visibility = ["//:sandbox"],
+    deps = [
+        "//runsc/boot",
+        "//runsc/specutils",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
new file mode 100644
index 000000000..57ab73d97
--- /dev/null
+++ b/runsc/testutil/testutil.go
@@ -0,0 +1,440 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil contains utility functions for runsc tests.
+package testutil
+
+import (
+	"bufio"
+	"context"
+	"debug/elf"
+	"encoding/base32"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net/http"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"syscall"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+var (
+	checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support")
+)
+
+func init() {
+	rand.Seed(time.Now().UnixNano())
+}
+
+// IsCheckpointSupported returns the relevant command line flag.
+func IsCheckpointSupported() bool {
+	return *checkpoint
+}
+
+// TmpDir returns the absolute path to a writable directory that can be used as
+// scratch by the test.
+func TmpDir() string {
+	dir := os.Getenv("TEST_TMPDIR")
+	if dir == "" {
+		dir = "/tmp"
+	}
+	return dir
+}
+
+// ConfigureExePath configures the executable for runsc in the test environment.
+func ConfigureExePath() error {
+	path, err := FindFile("runsc/runsc")
+	if err != nil {
+		return err
+	}
+	specutils.ExePath = path
+	return nil
+}
+
+// FindFile searchs for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns to
+	// to find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
+
+// TestConfig returns the default configuration to use in tests. Note that
+// 'RootDir' must be set by caller if required.
+func TestConfig() *boot.Config {
+	return &boot.Config{
+		Debug:           true,
+		LogFormat:       "text",
+		DebugLogFormat:  "text",
+		AlsoLogToStderr: true,
+		LogPackets:      true,
+		Network:         boot.NetworkNone,
+		Strace:          true,
+		Platform:        "ptrace",
+		FileAccess:      boot.FileAccessExclusive,
+		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+		NumNetworkChannels:                         1,
+	}
+}
+
+// TestConfigWithRoot returns the default configuration to use in tests.
+func TestConfigWithRoot(rootDir string) *boot.Config {
+	conf := TestConfig()
+	conf.RootDir = rootDir
+	return conf
+}
+
+// NewSpecWithArgs creates a simple spec with the given args suitable for use
+// in tests.
+func NewSpecWithArgs(args ...string) *specs.Spec {
+	return &specs.Spec{
+		// The host filesystem root is the container root.
+		Root: &specs.Root{
+			Path:     "/",
+			Readonly: true,
+		},
+		Process: &specs.Process{
+			Args: args,
+			Env: []string{
+				"PATH=" + os.Getenv("PATH"),
+			},
+			Capabilities: specutils.AllCapabilities(),
+		},
+		Mounts: []specs.Mount{
+			// Root is readonly, but many tests want to write to tmpdir.
+			// This creates a writable mount inside the root. Also, when tmpdir points
+			// to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs
+			// inside the sentry.
+			{
+				Type:        "bind",
+				Destination: TmpDir(),
+				Source:      TmpDir(),
+			},
+		},
+		Hostname: "runsc-test-hostname",
+	}
+}
+
+// SetupRootDir creates a root directory for containers.
+func SetupRootDir() (string, error) {
+	rootDir, err := ioutil.TempDir(TmpDir(), "containers")
+	if err != nil {
+		return "", fmt.Errorf("error creating root dir: %v", err)
+	}
+	return rootDir, nil
+}
+
+// SetupContainer creates a bundle and root dir for the container, generates a
+// test config, and writes the spec to config.json in the bundle dir.
+func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
+	rootDir, err = SetupRootDir()
+	if err != nil {
+		return "", "", err
+	}
+	conf.RootDir = rootDir
+	bundleDir, err = SetupBundleDir(spec)
+	return rootDir, bundleDir, err
+}
+
+// SetupBundleDir creates a bundle dir and writes the spec to config.json.
+func SetupBundleDir(spec *specs.Spec) (bundleDir string, err error) {
+	bundleDir, err = ioutil.TempDir(TmpDir(), "bundle")
+	if err != nil {
+		return "", fmt.Errorf("error creating bundle dir: %v", err)
+	}
+
+	if err = writeSpec(bundleDir, spec); err != nil {
+		return "", fmt.Errorf("error writing spec: %v", err)
+	}
+	return bundleDir, nil
+}
+
+// writeSpec writes the spec to disk in the given directory.
+func writeSpec(dir string, spec *specs.Spec) error {
+	b, err := json.Marshal(spec)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
+}
+
+// UniqueContainerID generates a unique container id for each test.
+//
+// The container id is used to create an abstract unix domain socket, which must
+// be unique.  While the container forbids creating two containers with the same
+// name, sometimes between test runs the socket does not get cleaned up quickly
+// enough, causing container creation to fail.
+func UniqueContainerID() string {
+	// Read 20 random bytes.
+	b := make([]byte, 20)
+	// "[Read] always returns len(p) and a nil error." --godoc
+	if _, err := rand.Read(b); err != nil {
+		panic("rand.Read failed: " + err.Error())
+	}
+	// base32 encode the random bytes, so that the name is a valid
+	// container id and can be used as a socket name in the filesystem.
+	return fmt.Sprintf("test-container-%s", base32.StdEncoding.EncodeToString(b))
+}
+
+// Copy copies file from src to dst.
+func Copy(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	defer out.Close()
+
+	_, err = io.Copy(out, in)
+	return err
+}
+
+// Poll is a shorthand function to poll for something with given timeout.
+func Poll(cb func() error, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+	return backoff.Retry(cb, b)
+}
+
+// WaitForHTTP tries GET requests on a port until the call succeeds or timeout.
+func WaitForHTTP(port int, timeout time.Duration) error {
+	cb := func() error {
+		c := &http.Client{
+			// Calculate timeout to be able to do minimum 5 attempts.
+			Timeout: timeout / 5,
+		}
+		url := fmt.Sprintf("http://localhost:%d/", port)
+		resp, err := c.Get(url)
+		if err != nil {
+			log.Printf("Waiting %s: %v", url, err)
+			return err
+		}
+		resp.Body.Close()
+		return nil
+	}
+	return Poll(cb, timeout)
+}
+
+// Reaper reaps child processes.
+type Reaper struct {
+	// mu protects ch, which will be nil if the reaper is not running.
+	mu sync.Mutex
+	ch chan os.Signal
+}
+
+// Start starts reaping child processes.
+func (r *Reaper) Start() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch != nil {
+		panic("reaper.Start called on a running reaper")
+	}
+
+	r.ch = make(chan os.Signal, 1)
+	signal.Notify(r.ch, syscall.SIGCHLD)
+
+	go func() {
+		for {
+			r.mu.Lock()
+			ch := r.ch
+			r.mu.Unlock()
+			if ch == nil {
+				return
+			}
+
+			_, ok := <-ch
+			if !ok {
+				// Channel closed.
+				return
+			}
+			for {
+				cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil)
+				if cpid < 1 {
+					break
+				}
+			}
+		}
+	}()
+}
+
+// Stop stops reaping child processes.
+func (r *Reaper) Stop() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch == nil {
+		panic("reaper.Stop called on a stopped reaper")
+	}
+
+	signal.Stop(r.ch)
+	close(r.ch)
+	r.ch = nil
+}
+
+// StartReaper is a helper that starts a new Reaper and returns a function to
+// stop it.
+func StartReaper() func() {
+	r := &Reaper{}
+	r.Start()
+	return r.Stop
+}
+
+// WaitUntilRead reads from the given reader until the wanted string is found
+// or until timeout.
+func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {
+	sc := bufio.NewScanner(r)
+	if split != nil {
+		sc.Split(split)
+	}
+	// done must be accessed atomically. A value greater than 0 indicates
+	// that the read loop can exit.
+	var done uint32
+	doneCh := make(chan struct{})
+	go func() {
+		for sc.Scan() {
+			t := sc.Text()
+			if strings.Contains(t, want) {
+				atomic.StoreUint32(&done, 1)
+				close(doneCh)
+				break
+			}
+			if atomic.LoadUint32(&done) > 0 {
+				break
+			}
+		}
+	}()
+	select {
+	case <-time.After(timeout):
+		atomic.StoreUint32(&done, 1)
+		return fmt.Errorf("timeout waiting to read %q", want)
+	case <-doneCh:
+		return nil
+	}
+}
+
+// KillCommand kills the process running cmd unless it hasn't been started. It
+// returns an error if it cannot kill the process unless the reason is that the
+// process has already exited.
+func KillCommand(cmd *exec.Cmd) error {
+	if cmd.Process == nil {
+		return nil
+	}
+	if err := cmd.Process.Kill(); err != nil {
+		if !strings.Contains(err.Error(), "process already finished") {
+			return fmt.Errorf("failed to kill process %v: %v", cmd, err)
+		}
+	}
+	return nil
+}
+
+// WriteTmpFile writes text to a temporary file, closes the file, and returns
+// the name of the file.
+func WriteTmpFile(pattern, text string) (string, error) {
+	file, err := ioutil.TempFile(TmpDir(), pattern)
+	if err != nil {
+		return "", err
+	}
+	defer file.Close()
+	if _, err := file.Write([]byte(text)); err != nil {
+		return "", err
+	}
+	return file.Name(), nil
+}
+
+// RandomName create a name with a 6 digit random number appended to it.
+func RandomName(prefix string) string {
+	return fmt.Sprintf("%s-%06d", prefix, rand.Int31n(1000000))
+}
+
+// IsStatic returns true iff the given file is a static binary.
+func IsStatic(filename string) (bool, error) {
+	f, err := elf.Open(filename)
+	if err != nil {
+		return false, err
+	}
+	for _, prog := range f.Progs {
+		if prog.Type == elf.PT_INTERP {
+			return false, nil // Has interpreter.
+		}
+	}
+	return true, nil
+}
diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD
deleted file mode 100644
index 5cff917ed..000000000
--- a/runsc/tools/dockercfg/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-
-package(licenses = ["notice"])
-
-go_binary(
-    name = "dockercfg",
-    srcs = ["dockercfg.go"],
-    visibility = ["//visibility:public"],
-    deps = ["@com_github_google_subcommands//:go_default_library"],
-)
diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go
deleted file mode 100644
index eb9dbd421..000000000
--- a/runsc/tools/dockercfg/dockercfg.go
+++ /dev/null
@@ -1,193 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Helper tool to configure Docker daemon.
-package main
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"io/ioutil"
-	"log"
-	"os"
-
-	"flag"
-	"github.com/google/subcommands"
-)
-
-var (
-	configFile   = flag.String("config_file", "/etc/docker/daemon.json", "path to Docker daemon config file")
-	experimental = flag.Bool("experimental", false, "enable experimental features")
-)
-
-func main() {
-	subcommands.Register(subcommands.HelpCommand(), "")
-	subcommands.Register(subcommands.FlagsCommand(), "")
-	subcommands.Register(&runtimeAdd{}, "")
-	subcommands.Register(&runtimeRemove{}, "")
-
-	// All subcommands must be registered before flag parsing.
-	flag.Parse()
-
-	exitCode := subcommands.Execute(context.Background())
-	os.Exit(int(exitCode))
-}
-
-type runtime struct {
-	Path        string   `json:"path,omitempty"`
-	RuntimeArgs []string `json:"runtimeArgs,omitempty"`
-}
-
-// runtimeAdd implements subcommands.Command.
-type runtimeAdd struct {
-}
-
-// Name implements subcommands.Command.Name.
-func (*runtimeAdd) Name() string {
-	return "runtime-add"
-}
-
-// Synopsis implements subcommands.Command.Synopsis.
-func (*runtimeAdd) Synopsis() string {
-	return "adds a runtime to docker daemon configuration"
-}
-
-// Usage implements subcommands.Command.Usage.
-func (*runtimeAdd) Usage() string {
-	return `runtime-add [flags] <name> <path> [args...]  -- if provided, args are passed as arguments to the runtime
-`
-}
-
-// SetFlags implements subcommands.Command.SetFlags.
-func (*runtimeAdd) SetFlags(*flag.FlagSet) {
-}
-
-// Execute implements subcommands.Command.Execute.
-func (r *runtimeAdd) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if f.NArg() < 2 {
-		f.Usage()
-		return subcommands.ExitUsageError
-	}
-	name := f.Arg(0)
-	path := f.Arg(1)
-	runtimeArgs := f.Args()[2:]
-
-	fmt.Printf("Adding runtime %q to file %q\n", name, *configFile)
-	c, err := readConfig(*configFile)
-	if err != nil {
-		log.Fatalf("Error reading config file %q: %v", *configFile, err)
-	}
-
-	var rts map[string]interface{}
-	if i, ok := c["runtimes"]; ok {
-		rts = i.(map[string]interface{})
-	} else {
-		rts = make(map[string]interface{})
-		c["runtimes"] = rts
-	}
-	if *experimental {
-		c["experimental"] = true
-	}
-	rts[name] = runtime{Path: path, RuntimeArgs: runtimeArgs}
-
-	if err := writeConfig(c, *configFile); err != nil {
-		log.Fatalf("Error writing config file %q: %v", *configFile, err)
-	}
-	return subcommands.ExitSuccess
-}
-
-// runtimeRemove implements subcommands.Command.
-type runtimeRemove struct {
-}
-
-// Name implements subcommands.Command.Name.
-func (*runtimeRemove) Name() string {
-	return "runtime-rm"
-}
-
-// Synopsis implements subcommands.Command.Synopsis.
-func (*runtimeRemove) Synopsis() string {
-	return "removes a runtime from docker daemon configuration"
-}
-
-// Usage implements subcommands.Command.Usage.
-func (*runtimeRemove) Usage() string {
-	return `runtime-rm [flags] <name>
-`
-}
-
-// SetFlags implements subcommands.Command.SetFlags.
-func (*runtimeRemove) SetFlags(*flag.FlagSet) {
-}
-
-// Execute implements subcommands.Command.Execute.
-func (r *runtimeRemove) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	if f.NArg() != 1 {
-		f.Usage()
-		return subcommands.ExitUsageError
-	}
-	name := f.Arg(0)
-
-	fmt.Printf("Removing runtime %q from file %q\n", name, *configFile)
-	c, err := readConfig(*configFile)
-	if err != nil {
-		log.Fatalf("Error reading config file %q: %v", *configFile, err)
-	}
-
-	var rts map[string]interface{}
-	if i, ok := c["runtimes"]; ok {
-		rts = i.(map[string]interface{})
-	} else {
-		log.Fatalf("runtime %q not found", name)
-	}
-	if _, ok := rts[name]; !ok {
-		log.Fatalf("runtime %q not found", name)
-	}
-	delete(rts, name)
-
-	if err := writeConfig(c, *configFile); err != nil {
-		log.Fatalf("Error writing config file %q: %v", *configFile, err)
-	}
-	return subcommands.ExitSuccess
-}
-
-func readConfig(path string) (map[string]interface{}, error) {
-	configBytes, err := ioutil.ReadFile(path)
-	if err != nil && !os.IsNotExist(err) {
-		return nil, err
-	}
-	c := make(map[string]interface{})
-	if len(configBytes) > 0 {
-		if err := json.Unmarshal(configBytes, &c); err != nil {
-			return nil, err
-		}
-	}
-	return c, nil
-}
-
-func writeConfig(c map[string]interface{}, path string) error {
-	b, err := json.MarshalIndent(c, "", "    ")
-	if err != nil {
-		return err
-	}
-
-	if err := os.Rename(path, path+"~"); err != nil && !os.IsNotExist(err) {
-		return fmt.Errorf("error renaming config file %q: %v", path, err)
-	}
-	if err := ioutil.WriteFile(path, b, 0644); err != nil {
-		return fmt.Errorf("error writing config file %q: %v", path, err)
-	}
-	return nil
-}
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100755
index 000000000..dae3460af
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Build runsc.
+runsc=$(build -c opt //runsc)
+
+# Build packages.
+pkg=$(build -c opt --host_force_python=py2 //runsc:debian)
+
+# Build a repository, if the key is available.
+if [[ -v KOKORO_REPO_KEY ]]; then
+  repo=$(tools/make_repository.sh "${KOKORO_REPO_KEY}" gvisor-bot@google.com)
+fi
+
+# Install installs artifacts.
+install() {
+  mkdir -p $1
+  cp "${runsc}" "$1"/runsc
+  sha512sum "$1"/runsc | awk '{print $1 "  runsc"}' > "$1"/runsc.sha512
+  if [[ -v repo ]]; then
+    cp -a "${repo}" "${latest_dir}"/repo
+  fi
+}
+
+# Move the runsc binary into "latest" directory, and also a directory with the
+# current date. If the current commit happens to correpond to a tag, then we
+# will also move everything into a directory named after the given tag.
+if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
+  if [[ "${KOKORO_BUILD_NIGHTLY}" == "true" ]]; then
+    # The "latest" directory and current date.
+    install "${KOKORO_ARTIFACTS_DIR}/nightly/latest"
+    install "${KOKORO_ARTIFACTS_DIR}/nightly/$(date -Idate)"
+  else
+    # Is it a tagged release? Build that instead. In that case, we also try to
+    # update the base release directory, in case this is an update. Finally, we
+    # update the "release" directory, which has the last released version.
+    tag="$(git describe --exact-match --tags HEAD)"
+    if ! [[ -z "${tag}" ]]; then
+      install "${KOKORO_ARTIFACTS_DIR}/${tag}"
+      base=$(echo "${tag}" | cut -d'.' -f1)
+      if [[ "${base}" != "${tag}" ]]; then
+        install "${KOKORO_ARTIFACTS_DIR}/${base}"
+      fi
+      install "${KOKORO_ARTIFACTS_DIR}/release"
+    fi
+  fi
+fi
diff --git a/scripts/common.sh b/scripts/common.sh
new file mode 100755
index 000000000..f2b9e24d8
--- /dev/null
+++ b/scripts/common.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+if [[ -f $(dirname $0)/common_google.sh ]]; then
+  source $(dirname $0)/common_google.sh
+else
+  source $(dirname $0)/common_bazel.sh
+fi
diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh
new file mode 100755
index 000000000..42248cb25
--- /dev/null
+++ b/scripts/common_bazel.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Switch into the workspace; only necessary if run with kokoro.
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+  cd git/repo
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+  cd github/repo
+fi
+
+# Set the standard bazel flags.
+declare -r BAZEL_FLAGS=(
+  "--show_timestamps"
+  "--test_output=errors"
+  "--keep_going"
+  "--verbose_failures=true"
+)
+if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]] || [[ -v RBE_PROJECT_ID ]]; then
+  declare -r RBE_PROJECT_ID="${RBE_PROJECT_ID:-gvisor-rbe}"
+  declare -r BAZEL_RBE_FLAGS=(
+    "--config=remote"
+    "--project_id=${RBE_PROJECT_ID}"
+    "--remote_instance_name=projects/${RBE_PROJECT_ID}/instances/default_instance"
+  )
+fi
+if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
+  declare -r BAZEL_RBE_AUTH_FLAGS=(
+    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+  )
+fi
+
+# Wrap bazel.
+function build() {
+  bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@"
+}
+
+function test() {
+  (bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" && rc=0) || rc=$?
+
+  # Zip out everything into a convenient form.
+  if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
+    find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
+      tar --create --files-from - --transform 's/test\./sponge_log./' |
+      tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
+  fi
+
+  return $rc
+}
+
+function run() {
+  local binary=$1
+  shift
+  bazel run "${binary}" -- "$@"
+}
+
+function run_as_root() {
+  local binary=$1
+  shift
+  bazel run --run_under="sudo" "${binary}" -- "$@"
+}
diff --git a/scripts/do_tests.sh b/scripts/do_tests.sh
new file mode 100755
index 000000000..a3a387c37
--- /dev/null
+++ b/scripts/do_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Build runsc.
+build //runsc
+
+# run runsc do without root privileges.
+run //runsc --rootless do true
+run //runsc --rootless --network=none do true
+
+# run runsc do with root privileges.
+run_as_root //runsc do true
diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh
new file mode 100755
index 000000000..d6b18a35b
--- /dev/null
+++ b/scripts/docker_tests.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Install the runtime and perform basic tests.
+run_as_root //runsc install --experimental=true -- --debug --strace --log-packets
+sudo systemctl restart docker
+test //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/go.sh b/scripts/go.sh
new file mode 100755
index 000000000..e49d76c6d
--- /dev/null
+++ b/scripts/go.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Build the go path.
+build :gopath
+
+# Build the synthetic branch.
+tools/go_branch.sh
+
+# Checkout the new branch.
+git checkout go && git clean -f
+
+# Build everything.
+go build ./...
+
+# Push, if required.
+if [[ "${KOKORO_GO_PUSH}" == "true" ]]; then
+  git push origin go:go
+fi
diff --git a/scripts/hostnet_tests.sh b/scripts/hostnet_tests.sh
new file mode 100755
index 000000000..0631c5510
--- /dev/null
+++ b/scripts/hostnet_tests.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Install the runtime and perform basic tests.
+run_as_root //runsc install --experimental=true -- --debug --strace --log-packets --network=host
+sudo systemctl restart docker
+test --test_arg=-checkpoint=false //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/kvm_tests.sh b/scripts/kvm_tests.sh
new file mode 100755
index 000000000..5cb7aa007
--- /dev/null
+++ b/scripts/kvm_tests.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Ensure that KVM is loaded, and we can use it.
+(lsmod | grep -E '^(kvm_intel|kvm_amd)') || sudo modprobe kvm
+sudo chmod a+rw /dev/kvm
+
+# Run all KVM-tagged tests (locally).
+test --test_strategy=standalone --test_tag_filters=requires-kvm //...
+test --test_strategy=standalone //pkg/sentry/platform/kvm:kvm_test
+
+# Install the KVM runtime and run all integration tests.
+run_as_root //runsc install --experimental=true -- --debug --strace --log-packets --platform=kvm
+sudo systemctl restart docker
+test --test_strategy=standalone //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/make_tests.sh b/scripts/make_tests.sh
new file mode 100755
index 000000000..0fa1248be
--- /dev/null
+++ b/scripts/make_tests.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+top_level=$(git rev-parse --show-toplevel 2>/dev/null)
+[[ $? -eq 0 ]] && cd "${top_level}" || exit 1
+
+make
+make runsc
+make bazel-shutdown
diff --git a/scripts/overlay_tests.sh b/scripts/overlay_tests.sh
new file mode 100755
index 000000000..651a51f70
--- /dev/null
+++ b/scripts/overlay_tests.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Install the runtime and perform basic tests.
+run_as_root //runsc install --experimental=true -- --debug --strace --log-packets --overlay
+sudo systemctl restart docker
+test //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/release.sh b/scripts/release.sh
new file mode 100755
index 000000000..422319500
--- /dev/null
+++ b/scripts/release.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Tag a release only if provided.
+if ! [[ -v KOKORO_RELEASE_COMMIT ]]; then
+  echo "No KOKORO_RELEASE_COMMIT provided." >&2
+  exit 1
+fi
+if ! [[ -v KOKORO_RELEASE_TAG ]]; then
+  echo "No KOKORO_RELEASE_TAG provided." >&2
+  exit 1
+fi
+
+# Ensure we have an appropriate configuration for the tag.
+git config --get user.name || git config user.name "gVisor-bot"
+git config --get user.email || git config user.email "gvisor-bot@google.com"
+
+# Run the release tool, which pushes to the origin repository.
+tools/tag_release.sh "${KOKORO_RELEASE_COMMIT}" "${KOKORO_RELEASE_TAG}"
diff --git a/scripts/root_tests.sh b/scripts/root_tests.sh
new file mode 100755
index 000000000..e42c0e3ec
--- /dev/null
+++ b/scripts/root_tests.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Reinstall the latest containerd shim.
+declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
+declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
+declare -r shim_path=$(mktemp --tmpdir gvisor-containerd-shim.XXXXXX)
+wget --no-verbose "${base}"/latest -O ${latest}
+wget --no-verbose "${base}"/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
+chmod +x ${shim_path}
+sudo mv ${shim_path} /usr/local/bin/gvisor-containerd-shim
+
+# Run the tests that require root.
+run_as_root //runsc install --experimental=true -- --debug --strace --log-packets
+sudo systemctl restart docker
+run_as_root //test/root:root_test
diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh
new file mode 100755
index 000000000..585216aae
--- /dev/null
+++ b/scripts/simple_tests.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Run all simple tests (locally).
+test //pkg/... //runsc/... //tools/...
diff --git a/scripts/syscall_tests.sh b/scripts/syscall_tests.sh
new file mode 100755
index 000000000..a131b2d50
--- /dev/null
+++ b/scripts/syscall_tests.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Run all ptrace-variants of the system call tests.
+test --test_tag_filters=runsc_ptrace //test/syscalls/...
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 000000000..09c36b461
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,18 @@
+# Tests
+
+The tests defined under this path are verifying functionality beyond what unit
+tests can cover, e.g. integration and end to end tests. Due to their nature,
+they may need extra setup in the test machine and extra configuration to run.
+
+-   **syscalls**: system call tests use a local runner, and do not require
+    additional configuration in the machine.
+-   **integration:** defines integration tests that uses `docker run` to test
+    functionality.
+-   **image:** basic end to end test for popular images. These require the same
+    setup as integration tests.
+-   **root:** tests that require to be run as root.
+-   **util:** utilities library to support the tests.
+
+For the above noted cases, the relevant runtime must be installed via `runsc
+install` before running. This is handled automatically by the test scripts in
+the `kokoro` directory.
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
new file mode 100644
index 000000000..99442cffb
--- /dev/null
+++ b/test/e2e/BUILD
@@ -0,0 +1,31 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_test(
+    name = "integration_test",
+    size = "large",
+    srcs = [
+        "exec_test.go",
+        "integration_test.go",
+        "regression_test.go",
+    ],
+    embed = [":integration"],
+    tags = [
+        # Requires docker and runsc to be configured before the test runs.
+        "manual",
+        "local",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi/linux",
+        "//runsc/dockerutil",
+        "//runsc/testutil",
+    ],
+)
+
+go_library(
+    name = "integration",
+    srcs = ["integration.go"],
+    importpath = "gvisor.dev/gvisor/test/integration",
+)
diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
new file mode 100644
index 000000000..ce2c4f689
--- /dev/null
+++ b/test/e2e/exec_test.go
@@ -0,0 +1,156 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package image provides end-to-end integration tests for runsc. These tests
+// require docker and runsc to be installed on the machine.
+//
+// Each test calls docker commands to start up a container, and tests that it
+// is behaving properly, with various runsc commands. The container is killed
+// and deleted at the end.
+
+package integration
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/runsc/dockerutil"
+)
+
+func TestExecCapabilities(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("exec-test")
+
+	// Start the container.
+	if err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
+	if err != nil {
+		t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
+	}
+	if len(matches) != 2 {
+		t.Fatalf("There should be a match for the whole line and the capability bitmask")
+	}
+	capString := matches[1]
+	t.Log("Root capabilities:", capString)
+
+	// CAP_NET_RAW was in the capability set for the container, but was
+	// removed. However, `exec` does not remove it. Verify that it's not
+	// set in the container, then re-add it for comparison.
+	caps, err := strconv.ParseUint(capString, 16, 64)
+	if err != nil {
+		t.Fatalf("failed to convert capabilities %q: %v", capString, err)
+	}
+	if caps&(1<<uint64(linux.CAP_NET_RAW)) != 0 {
+		t.Fatalf("CAP_NET_RAW should be filtered, but is set in the container: %x", caps)
+	}
+	caps |= 1 << uint64(linux.CAP_NET_RAW)
+	want := fmt.Sprintf("CapEff:\t%016x\n", caps)
+
+	// Now check that exec'd process capabilities match the root.
+	got, err := d.Exec("grep", "CapEff:", "/proc/self/status")
+	if err != nil {
+		t.Fatalf("docker exec failed: %v", err)
+	}
+	if got != want {
+		t.Errorf("wrong capabilities, got: %q, want: %q", got, want)
+	}
+}
+
+func TestExecJobControl(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("exec-job-control-test")
+
+	// Start the container.
+	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Exec 'sh' with an attached pty.
+	cmd, ptmx, err := d.ExecWithTerminal("sh")
+	if err != nil {
+		t.Fatalf("docker exec failed: %v", err)
+	}
+	defer ptmx.Close()
+
+	// Call "sleep 100 | cat" in the shell.  We pipe to cat so that there
+	// will be two processes in the foreground process group.
+	if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+
+	// Give shell a few seconds to start executing the sleep.
+	time.Sleep(2 * time.Second)
+
+	// Send a ^C to the pty, which should kill sleep and cat, but not the
+	// shell.  \x03 is ASCII "end of text", which is the same as ^C.
+	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+
+	// The shell should still be alive at this point. Sleep should have
+	// exited with code 2+128=130. We'll exit with 10 plus that number, so
+	// that we can be sure that the shell did not get signalled.
+	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+
+	// Exec process should exit with code 10+130=140.
+	ps, err := cmd.Process.Wait()
+	if err != nil {
+		t.Fatalf("error waiting for exec process: %v", err)
+	}
+	ws := ps.Sys().(syscall.WaitStatus)
+	if !ws.Exited() {
+		t.Errorf("ws.Exited got false, want true")
+	}
+	if got, want := ws.ExitStatus(), 140; got != want {
+		t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
+	}
+}
+
+// Test that failure to exec returns proper error message.
+func TestExecError(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("exec-error-test")
+
+	// Start the container.
+	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	_, err := d.Exec("no_can_find")
+	if err == nil {
+		t.Fatalf("docker exec didn't fail")
+	}
+	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
+		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
+	}
+}
diff --git a/test/e2e/integration.go b/test/e2e/integration.go
new file mode 100644
index 000000000..4cd5f6c24
--- /dev/null
+++ b/test/e2e/integration.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package integration is empty. See integration_test.go for description.
+package integration
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
new file mode 100644
index 000000000..7cc0de129
--- /dev/null
+++ b/test/e2e/integration_test.go
@@ -0,0 +1,348 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package integration provides end-to-end integration tests for runsc.
+//
+// Each test calls docker commands to start up a container, and tests that it is
+// behaving properly, with various runsc commands. The container is killed and
+// deleted at the end.
+//
+// Setup instruction in test/README.md.
+package integration
+
+import (
+	"flag"
+	"fmt"
+	"net"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+// httpRequestSucceeds sends a request to a given url and checks that the status is OK.
+func httpRequestSucceeds(client http.Client, server string, port int) error {
+	url := fmt.Sprintf("http://%s:%d", server, port)
+	// Ensure that content is being served.
+	resp, err := client.Get(url)
+	if err != nil {
+		return fmt.Errorf("error reaching http server: %v", err)
+	}
+	if want := http.StatusOK; resp.StatusCode != want {
+		return fmt.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want)
+	}
+	return nil
+}
+
+// TestLifeCycle tests a basic Create/Start/Stop docker container life cycle.
+func TestLifeCycle(t *testing.T) {
+	if err := dockerutil.Pull("nginx"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("lifecycle-test")
+	if err := d.Create("-p", "80", "nginx"); err != nil {
+		t.Fatal("docker create failed:", err)
+	}
+	if err := d.Start(); err != nil {
+		d.CleanUp()
+		t.Fatal("docker start failed:", err)
+	}
+
+	// Test that container is working
+	port, err := d.FindPort(80)
+	if err != nil {
+		t.Fatal("docker.FindPort(80) failed: ", err)
+	}
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Fatal("WaitForHTTP() timeout:", err)
+	}
+	client := http.Client{Timeout: time.Duration(2 * time.Second)}
+	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+		t.Error("http request failed:", err)
+	}
+
+	if err := d.Stop(); err != nil {
+		d.CleanUp()
+		t.Fatal("docker stop failed:", err)
+	}
+	if err := d.Remove(); err != nil {
+		t.Fatal("docker rm failed:", err)
+	}
+}
+
+func TestPauseResume(t *testing.T) {
+	const img = "gcr.io/gvisor-presubmit/python-hello"
+	if !testutil.IsCheckpointSupported() {
+		t.Log("Checkpoint is not supported, skipping test.")
+		return
+	}
+
+	if err := dockerutil.Pull(img); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("pause-resume-test")
+	if err := d.Run("-p", "8080", img); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Find where port 8080 is mapped to.
+	port, err := d.FindPort(8080)
+	if err != nil {
+		t.Fatal("docker.FindPort(8080) failed:", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Fatal("WaitForHTTP() timeout:", err)
+	}
+
+	// Check that container is working.
+	client := http.Client{Timeout: time.Duration(2 * time.Second)}
+	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+		t.Error("http request failed:", err)
+	}
+
+	if err := d.Pause(); err != nil {
+		t.Fatal("docker pause failed:", err)
+	}
+
+	// Check if container is paused.
+	switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) {
+	case nil:
+		t.Errorf("http req expected to fail but it succeeded")
+	case net.Error:
+		if !v.Timeout() {
+			t.Errorf("http req got error %v, wanted timeout", v)
+		}
+	default:
+		t.Errorf("http req got unexpected error %v", v)
+	}
+
+	if err := d.Unpause(); err != nil {
+		t.Fatal("docker unpause failed:", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Fatal("WaitForHTTP() timeout:", err)
+	}
+
+	// Check if container is working again.
+	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+		t.Error("http request failed:", err)
+	}
+}
+
+func TestCheckpointRestore(t *testing.T) {
+	const img = "gcr.io/gvisor-presubmit/python-hello"
+	if !testutil.IsCheckpointSupported() {
+		t.Log("Pause/resume is not supported, skipping test.")
+		return
+	}
+
+	if err := dockerutil.Pull(img); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("save-restore-test")
+	if err := d.Run("-p", "8080", img); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	if err := d.Checkpoint("test"); err != nil {
+		t.Fatal("docker checkpoint failed:", err)
+	}
+
+	if _, err := d.Wait(30 * time.Second); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := d.Restore("test"); err != nil {
+		t.Fatal("docker restore failed:", err)
+	}
+
+	// Find where port 8080 is mapped to.
+	port, err := d.FindPort(8080)
+	if err != nil {
+		t.Fatal("docker.FindPort(8080) failed:", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Fatal("WaitForHTTP() timeout:", err)
+	}
+
+	// Check if container is working again.
+	client := http.Client{Timeout: time.Duration(2 * time.Second)}
+	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+		t.Error("http request failed:", err)
+	}
+}
+
+// Create client and server that talk to each other using the local IP.
+func TestConnectToSelf(t *testing.T) {
+	d := dockerutil.MakeDocker("connect-to-self-test")
+
+	// Creates server that replies "server" and exists. Sleeps at the end because
+	// 'docker exec' gets killed if the init process exists before it can finish.
+	if err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
+		t.Fatal("docker run failed:", err)
+	}
+	defer d.CleanUp()
+
+	// Finds IP address for host.
+	ip, err := d.Exec("/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
+	if err != nil {
+		t.Fatal("docker exec failed:", err)
+	}
+	ip = strings.TrimRight(ip, "\n")
+
+	// Runs client that sends "client" to the server and exits.
+	reply, err := d.Exec("/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
+	if err != nil {
+		t.Fatal("docker exec failed:", err)
+	}
+
+	// Ensure both client and server got the message from each other.
+	if want := "server\n"; reply != want {
+		t.Errorf("Error on server, want: %q, got: %q", want, reply)
+	}
+	if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil {
+		t.Fatal("docker.WaitForOutput(client) timeout:", err)
+	}
+}
+
+func TestMemLimit(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("cgroup-test")
+	cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'"
+	out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd)
+	if err != nil {
+		t.Fatal("docker run failed:", err)
+	}
+	defer d.CleanUp()
+
+	// Remove warning message that swap isn't present.
+	if strings.HasPrefix(out, "WARNING") {
+		lines := strings.Split(out, "\n")
+		if len(lines) != 3 {
+			t.Fatalf("invalid output: %s", out)
+		}
+		out = lines[1]
+	}
+
+	got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64)
+	if err != nil {
+		t.Fatalf("failed to parse %q: %v", out, err)
+	}
+	if want := uint64(500 * 1024); got != want {
+		t.Errorf("MemTotal got: %d, want: %d", got, want)
+	}
+}
+
+func TestNumCPU(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("cgroup-test")
+	cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l"
+	out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd)
+	if err != nil {
+		t.Fatal("docker run failed:", err)
+	}
+	defer d.CleanUp()
+
+	got, err := strconv.Atoi(strings.TrimSpace(out))
+	if err != nil {
+		t.Fatalf("failed to parse %q: %v", out, err)
+	}
+	if want := 1; got != want {
+		t.Errorf("MemTotal got: %d, want: %d", got, want)
+	}
+}
+
+// TestJobControl tests that job control characters are handled properly.
+func TestJobControl(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("job-control-test")
+
+	// Start the container with an attached PTY.
+	_, ptmx, err := d.RunWithPty("alpine", "sh")
+	if err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer ptmx.Close()
+	defer d.CleanUp()
+
+	// Call "sleep 100" in the shell.
+	if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+
+	// Give shell a few seconds to start executing the sleep.
+	time.Sleep(2 * time.Second)
+
+	// Send a ^C to the pty, which should kill sleep, but not the shell.
+	// \x03 is ASCII "end of text", which is the same as ^C.
+	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+
+	// The shell should still be alive at this point. Sleep should have
+	// exited with code 2+128=130. We'll exit with 10 plus that number, so
+	// that we can be sure that the shell did not get signalled.
+	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+
+	// Wait for the container to exit.
+	got, err := d.Wait(5 * time.Second)
+	if err != nil {
+		t.Fatalf("error getting exit code: %v", err)
+	}
+	// Container should exit with code 10+130=140.
+	if want := syscall.WaitStatus(140); got != want {
+		t.Errorf("container exited with code %d want %d", got, want)
+	}
+}
+
+// TestTmpFile checks that files inside '/tmp' are not overridden. In addition,
+// it checks that working dir is created if it doesn't exit.
+func TestTmpFile(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("tmp-file-test")
+	if err := d.Run("-w=/tmp/foo/bar", "--read-only", "alpine", "touch", "/tmp/foo/bar/file"); err != nil {
+		t.Fatal("docker run failed:", err)
+	}
+	defer d.CleanUp()
+}
+
+func TestMain(m *testing.M) {
+	dockerutil.EnsureSupportedDockerVersion()
+	flag.Parse()
+	os.Exit(m.Run())
+}
diff --git a/test/e2e/regression_test.go b/test/e2e/regression_test.go
new file mode 100644
index 000000000..2488be383
--- /dev/null
+++ b/test/e2e/regression_test.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration
+
+import (
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/runsc/dockerutil"
+)
+
+// Test that UDS can be created using overlay when parent directory is in lower
+// layer only (b/134090485).
+//
+// Prerequisite: the directory where the socket file is created must not have
+// been open for write before bind(2) is called.
+func TestBindOverlay(t *testing.T) {
+	if err := dockerutil.Pull("ubuntu:trusty"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("bind-overlay-test")
+
+	cmd := "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p"
+	got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
+	if err != nil {
+		t.Fatal("docker run failed:", err)
+	}
+
+	if want := "foobar-asdf"; !strings.Contains(got, want) {
+		t.Fatalf("docker run output is missing %q: %s", want, got)
+	}
+	defer d.CleanUp()
+}
diff --git a/test/image/BUILD b/test/image/BUILD
new file mode 100644
index 000000000..09b0a0ad5
--- /dev/null
+++ b/test/image/BUILD
@@ -0,0 +1,34 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_test(
+    name = "image_test",
+    size = "large",
+    srcs = [
+        "image_test.go",
+    ],
+    data = [
+        "latin10k.txt",
+        "mysql.sql",
+        "ruby.rb",
+        "ruby.sh",
+    ],
+    embed = [":image"],
+    tags = [
+        # Requires docker and runsc to be configured before the test runs.
+        "manual",
+        "local",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//runsc/dockerutil",
+        "//runsc/testutil",
+    ],
+)
+
+go_library(
+    name = "image",
+    srcs = ["image.go"],
+    importpath = "gvisor.dev/gvisor/test/image",
+)
diff --git a/test/image/image.go b/test/image/image.go
new file mode 100644
index 000000000..297f1ab92
--- /dev/null
+++ b/test/image/image.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package image is empty. See image_test.go for description.
+package image
diff --git a/test/image/image_test.go b/test/image/image_test.go
new file mode 100644
index 000000000..d0dcb1861
--- /dev/null
+++ b/test/image/image_test.go
@@ -0,0 +1,353 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package image provides end-to-end image tests for runsc.
+
+// Each test calls docker commands to start up a container, and tests that it
+// is behaving properly, like connecting to a port or looking at the output.
+// The container is killed and deleted at the end.
+//
+// Setup instruction in test/README.md.
+package image
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+func TestHelloWorld(t *testing.T) {
+	d := dockerutil.MakeDocker("hello-test")
+	if err := d.Run("hello-world"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil {
+		t.Fatalf("docker didn't say hello: %v", err)
+	}
+}
+
+func runHTTPRequest(port int) error {
+	url := fmt.Sprintf("http://localhost:%d/not-found", port)
+	resp, err := http.Get(url)
+	if err != nil {
+		return fmt.Errorf("error reaching http server: %v", err)
+	}
+	if want := http.StatusNotFound; resp.StatusCode != want {
+		return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
+	}
+
+	url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port)
+	resp, err = http.Get(url)
+	if err != nil {
+		return fmt.Errorf("Error reaching http server: %v", err)
+	}
+	if want := http.StatusOK; resp.StatusCode != want {
+		return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
+	}
+
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return fmt.Errorf("Error reading http response: %v", err)
+	}
+	defer resp.Body.Close()
+
+	// READALL is the last word in the file. Ensures everything was read.
+	if want := "READALL"; strings.HasSuffix(string(body), want) {
+		return fmt.Errorf("response doesn't contain %q, resp: %q", want, body)
+	}
+	return nil
+}
+
+func testHTTPServer(t *testing.T, port int) {
+	const requests = 10
+	ch := make(chan error, requests)
+	for i := 0; i < requests; i++ {
+		go func() {
+			start := time.Now()
+			err := runHTTPRequest(port)
+			log.Printf("Response time %v: %v", time.Since(start).String(), err)
+			ch <- err
+		}()
+	}
+
+	for i := 0; i < requests; i++ {
+		err := <-ch
+		if err != nil {
+			t.Errorf("testHTTPServer(%d) failed: %v", port, err)
+		}
+	}
+}
+
+func TestHttpd(t *testing.T) {
+	if err := dockerutil.Pull("httpd"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("http-test")
+
+	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	if err != nil {
+		t.Fatalf("PrepareFiles() failed: %v", err)
+	}
+
+	// Start the container.
+	mountArg := dockerutil.MountArg(dir, "/usr/local/apache2/htdocs", dockerutil.ReadOnly)
+	if err := d.Run("-p", "80", mountArg, "httpd"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Find where port 80 is mapped to.
+	port, err := d.FindPort(80)
+	if err != nil {
+		t.Fatalf("docker.FindPort(80) failed: %v", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Errorf("WaitForHTTP() timeout: %v", err)
+	}
+
+	testHTTPServer(t, port)
+}
+
+func TestNginx(t *testing.T) {
+	if err := dockerutil.Pull("nginx"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("net-test")
+
+	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	if err != nil {
+		t.Fatalf("PrepareFiles() failed: %v", err)
+	}
+
+	// Start the container.
+	mountArg := dockerutil.MountArg(dir, "/usr/share/nginx/html", dockerutil.ReadOnly)
+	if err := d.Run("-p", "80", mountArg, "nginx"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Find where port 80 is mapped to.
+	port, err := d.FindPort(80)
+	if err != nil {
+		t.Fatalf("docker.FindPort(80) failed: %v", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Errorf("WaitForHTTP() timeout: %v", err)
+	}
+
+	testHTTPServer(t, port)
+}
+
+func TestMysql(t *testing.T) {
+	if err := dockerutil.Pull("mysql"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("mysql-test")
+
+	// Start the container.
+	if err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Wait until it's up and running.
+	if _, err := d.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
+		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	}
+
+	client := dockerutil.MakeDocker("mysql-client-test")
+	dir, err := dockerutil.PrepareFiles("mysql.sql")
+	if err != nil {
+		t.Fatalf("PrepareFiles() failed: %v", err)
+	}
+
+	// Tell mysql client to connect to the server and execute the file in verbose
+	// mode to verify the output.
+	args := []string{
+		dockerutil.LinkArg(&d, "mysql"),
+		dockerutil.MountArg(dir, "/sql", dockerutil.ReadWrite),
+		"mysql",
+		"mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql",
+	}
+	if err := client.Run(args...); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer client.CleanUp()
+
+	// Ensure file executed to the end and shutdown mysql.
+	if _, err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil {
+		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	}
+	if _, err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
+		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	}
+}
+
+func TestPythonHello(t *testing.T) {
+	// TODO(b/136503277): Once we have more complete python runtime tests,
+	// we can drop this one.
+	const img = "gcr.io/gvisor-presubmit/python-hello"
+	if err := dockerutil.Pull(img); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("python-hello-test")
+	if err := d.Run("-p", "8080", img); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Find where port 8080 is mapped to.
+	port, err := d.FindPort(8080)
+	if err != nil {
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
+	}
+
+	// Ensure that content is being served.
+	url := fmt.Sprintf("http://localhost:%d", port)
+	resp, err := http.Get(url)
+	if err != nil {
+		t.Errorf("Error reaching http server: %v", err)
+	}
+	if want := http.StatusOK; resp.StatusCode != want {
+		t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
+	}
+}
+
+func TestTomcat(t *testing.T) {
+	if err := dockerutil.Pull("tomcat:8.0"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("tomcat-test")
+	if err := d.Run("-p", "8080", "tomcat:8.0"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Find where port 8080 is mapped to.
+	port, err := d.FindPort(8080)
+	if err != nil {
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+	}
+
+	// Wait until it's up and running.
+	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
+	}
+
+	// Ensure that content is being served.
+	url := fmt.Sprintf("http://localhost:%d", port)
+	resp, err := http.Get(url)
+	if err != nil {
+		t.Errorf("Error reaching http server: %v", err)
+	}
+	if want := http.StatusOK; resp.StatusCode != want {
+		t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
+	}
+}
+
+func TestRuby(t *testing.T) {
+	if err := dockerutil.Pull("ruby"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("ruby-test")
+
+	dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh")
+	if err != nil {
+		t.Fatalf("PrepareFiles() failed: %v", err)
+	}
+	if err := os.Chmod(filepath.Join(dir, "ruby.sh"), 0333); err != nil {
+		t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err)
+	}
+
+	if err := d.Run("-p", "8080", dockerutil.MountArg(dir, "/src", dockerutil.ReadOnly), "ruby", "/src/ruby.sh"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// Find where port 8080 is mapped to.
+	port, err := d.FindPort(8080)
+	if err != nil {
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+	}
+
+	// Wait until it's up and running, 'gem install' can take some time.
+	if err := testutil.WaitForHTTP(port, 1*time.Minute); err != nil {
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
+	}
+
+	// Ensure that content is being served.
+	url := fmt.Sprintf("http://localhost:%d", port)
+	resp, err := http.Get(url)
+	if err != nil {
+		t.Errorf("error reaching http server: %v", err)
+	}
+	if want := http.StatusOK; resp.StatusCode != want {
+		t.Errorf("wrong response code, got: %d, want: %d", resp.StatusCode, want)
+	}
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		t.Fatalf("error reading body: %v", err)
+	}
+	if got, want := string(body), "Hello World"; !strings.Contains(got, want) {
+		t.Errorf("invalid body content, got: %q, want: %q", got, want)
+	}
+}
+
+func TestStdio(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := dockerutil.MakeDocker("stdio-test")
+
+	wantStdout := "hello stdout"
+	wantStderr := "bonjour stderr"
+	cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr)
+	if err := d.Run("alpine", "/bin/sh", "-c", cmd); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	for _, want := range []string{wantStdout, wantStderr} {
+		if _, err := d.WaitForOutput(want, 5*time.Second); err != nil {
+			t.Fatalf("docker didn't get output %q : %v", want, err)
+		}
+	}
+}
+
+func TestMain(m *testing.M) {
+	dockerutil.EnsureSupportedDockerVersion()
+	flag.Parse()
+	os.Exit(m.Run())
+}
diff --git a/test/image/latin10k.txt b/test/image/latin10k.txt
new file mode 100644
index 000000000..61341e00b
--- /dev/null
+++ b/test/image/latin10k.txt
@@ -0,0 +1,33 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras ut placerat felis. Maecenas urna est, auctor a efficitur sit amet, egestas et augue. Curabitur dignissim scelerisque nunc vel cursus. Ut vehicula est pretium, consectetur nunc non, pharetra ligula. Curabitur ut ultricies metus. Suspendisse pulvinar, orci sed fermentum vestibulum, eros turpis molestie lectus, nec elementum risus dolor mattis felis. Donec ultrices ipsum sem, at pretium lacus convallis at. Mauris nulla enim, tincidunt non bibendum at, vehicula pulvinar mauris.
+
+Duis in dapibus turpis. Pellentesque maximus magna odio, ac congue libero laoreet quis. Maecenas euismod risus in justo aliquam accumsan. Nunc quis ornare arcu, sit amet sodales elit. Phasellus nec scelerisque nisl, a tincidunt arcu. Proin ornare est nunc, sed suscipit orci interdum et. Suspendisse condimentum venenatis diam in tempor. Aliquam egestas lectus in rutrum tempus. Donec id egestas eros. Donec molestie consequat purus, sed posuere odio venenatis vitae. Nunc placerat augue id vehicula varius. In hac habitasse platea dictumst. Proin at est accumsan, venenatis quam a, fermentum risus. Phasellus posuere pellentesque enim, id suscipit magna consequat ut. Quisque ut tortor ante.
+
+Cras ut vulputate metus, a laoreet lectus. Vivamus ultrices molestie odio in tristique. Morbi faucibus mi eget sollicitudin fringilla. Fusce vitae lacinia ligula. Sed egestas sed diam eu posuere. Maecenas justo nisl, venenatis vel nibh vel, cursus aliquam velit. Praesent lacinia dui id erat venenatis rhoncus. Morbi gravida felis ante, sit amet vehicula orci rhoncus vitae.
+
+Sed finibus sagittis dictum. Proin auctor suscipit sem et mattis. Phasellus libero ligula, pellentesque ut felis porttitor, fermentum sollicitudin orci. Nulla eu nulla nibh. Fusce a eros risus. Proin vel magna risus. Donec nec elit eleifend, scelerisque sapien vitae, pharetra quam. Donec porttitor mauris scelerisque, tempus orci hendrerit, dapibus felis. Nullam libero elit, sollicitudin a aliquam at, ultrices in erat. Mauris eget ligula sodales, porta turpis et, scelerisque odio. Mauris mollis leo vitae purus gravida, in tempor nunc efficitur. Nulla facilisis posuere augue, nec pellentesque lectus eleifend ac. Vestibulum convallis est a feugiat tincidunt. Donec vitae enim volutpat, tincidunt eros eu, malesuada nibh.
+
+Quisque molestie, magna ornare elementum convallis, erat enim sagittis ipsum, eget porttitor sapien arcu id purus. Donec ut cursus diam. Nulla rutrum nulla et mi fermentum, vel tempus tellus posuere. Proin vitae pharetra nulla, nec ornare ex. Nulla consequat, augue a accumsan euismod, turpis leo ornare ligula, a pulvinar enim dolor ut augue. Quisque volutpat, lectus a varius mollis, nisl eros feugiat sem, at egestas lacus justo eu elit. Vestibulum scelerisque mauris est, sagittis interdum nunc accumsan sit amet. Maecenas aliquet ex ut lacus ornare, eu sagittis nibh imperdiet. Duis ultrices nisi velit, sed sodales risus sollicitudin et. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Etiam a accumsan augue, vitae pulvinar nulla. Pellentesque euismod sodales magna, nec luctus eros mattis eget. Sed lacinia suscipit lectus, eget consectetur dui pellentesque sed. Nullam nec mattis tellus.
+
+Aliquam erat volutpat. Praesent lobortis massa porttitor eros tincidunt, nec consequat diam pharetra. Duis efficitur non lorem sed mattis. Suspendisse justo nunc, pulvinar eu porttitor at, facilisis id eros. Suspendisse potenti. Cras molestie aliquet orci ut fermentum. In tempus aliquet eros nec suscipit. Suspendisse in mauris ut lectus ultrices blandit sit amet vitae est. Nam magna massa, porttitor ut semper id, feugiat vel quam. Suspendisse dignissim posuere scelerisque. Donec scelerisque lorem efficitur suscipit suscipit. Nunc luctus ligula et scelerisque lacinia.
+
+Suspendisse potenti. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Sed ultrices, sem in venenatis scelerisque, tellus ipsum porttitor urna, et iaculis lectus odio ac nisi. Integer luctus dui urna, at sollicitudin elit dapibus eu. Praesent nibh ante, porttitor a ante in, ullamcorper pretium felis. Aliquam vel tortor imperdiet, imperdiet lorem et, cursus mi. Proin tempus velit est, ut hendrerit metus gravida sed. Sed nibh sapien, faucibus quis ipsum in, scelerisque lacinia elit. In nec magna eu magna laoreet rhoncus. Donec vitae rutrum mauris. Integer urna felis, consequat at rhoncus vitae, auctor quis elit. Duis a pulvinar sem, nec gravida nisl. Nam non dapibus purus. Praesent vestibulum turpis nec erat porttitor, a scelerisque purus tincidunt.
+
+Nam fringilla leo nisi, nec placerat nisl luctus eget. Aenean malesuada nunc porta sapien sodales convallis. Suspendisse ut massa tempor, ullamcorper mi ut, faucibus turpis. Vivamus at sagittis metus. Donec varius ac mi eget sodales. Nulla feugiat, nulla eu fringilla fringilla, nunc lorem sollicitudin quam, vitae lacinia velit lorem eu orci. Mauris leo urna, pellentesque ac posuere non, pellentesque sit amet quam.
+
+Vestibulum porta diam urna, a aliquet nibh vestibulum et. Proin interdum bibendum nisl sed rhoncus. Sed vel diam hendrerit, faucibus ante et, hendrerit diam. Nunc dolor augue, mattis non dolor vel, luctus sodales neque. Cras malesuada fermentum dolor eu lobortis. Integer dapibus volutpat consequat. Maecenas posuere feugiat nunc. Donec vel mollis elit, volutpat consequat enim. Nulla id nisi finibus orci imperdiet elementum. Phasellus ultrices, elit vitae consequat rutrum, nisl est congue massa, quis condimentum justo nisi vitae turpis. Maecenas aliquet risus sit amet accumsan elementum. Proin non finibus elit, sit amet lobortis augue.
+
+Morbi pretium pulvinar sem vel sollicitudin. Proin imperdiet fringilla leo, non pellentesque lacus gravida nec. Vivamus ullamcorper consectetur ligula eu consectetur. Curabitur sit amet tempus purus. Curabitur quam quam, tincidunt eu tempus vel, volutpat at ipsum. Maecenas lobortis elit ac justo interdum, sit amet mattis ligula mollis. Sed posuere ligula et felis convallis tempor. Aliquam nec mollis velit. Donec varius sit amet erat at imperdiet. Nulla ipsum justo, tempor non sollicitudin gravida, dignissim vel orci. In hac habitasse platea dictumst. Cras cursus tellus id arcu aliquet accumsan. Phasellus ac erat dui.
+
+Duis mollis metus at mi luctus aliquam. Duis varius eget erat ac porttitor. Phasellus lobortis sagittis lacinia. Etiam sagittis eget erat in pulvinar. Phasellus sodales risus nec vulputate accumsan. Cras sit amet pellentesque dui. Praesent consequat felis mi, at vulputate diam convallis a. Donec hendrerit nibh vel justo consequat dictum. In euismod, dui sit amet malesuada suscipit, mauris ex rhoncus eros, sed ornare arcu nunc eu urna. Pellentesque eget erat augue. Integer rutrum mauris sem, nec sodales nulla cursus vel. Vivamus porta, urna vel varius vulputate, nulla arcu malesuada dui, a ultrices magna ante sed nibh.
+
+Morbi ultricies aliquam lorem id bibendum. Donec sit amet nunc vitae massa gravida eleifend hendrerit vel libero. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nulla vestibulum tempus condimentum. Aliquam dolor ipsum, condimentum in sapien et, tempor iaculis nulla. Aenean non pharetra augue. Maecenas mattis dignissim maximus. Fusce elementum tincidunt massa sit amet lobortis. Phasellus nec pharetra dui, et malesuada ante. Nullam commodo pretium tellus. Praesent sollicitudin, enim eget imperdiet scelerisque, odio felis vulputate dolor, eget auctor neque tellus ac lorem.
+
+In consectetur augue et sapien feugiat varius. Nam tortor mi, consectetur ac felis non, elementum venenatis augue. Suspendisse ut tellus in est sagittis cursus. Quisque faucibus, neque sit amet semper congue, nibh augue finibus odio, vitae interdum dolor arcu eget arcu. Curabitur dictum risus massa, non tincidunt urna molestie non. Maecenas eu quam purus. Donec vulputate, dui eu accumsan blandit, mauris tortor tristique mi, sed blandit leo quam id quam. Ut venenatis sagittis malesuada. Integer non auctor orci. Duis consectetur massa felis. Fusce euismod est sit amet bibendum finibus. Vestibulum dolor ex, tempor at elit in, iaculis cursus dui. Nunc sed neque ac risus rutrum tempus sit amet at ante. In hac habitasse platea dictumst.
+
+Donec rutrum, velit nec viverra tincidunt, est velit viverra neque, quis auctor leo ex at lectus. Morbi eget purus nisi. Aliquam lacus dui, interdum vitae elit at, venenatis dignissim est. Duis ac mollis lorem. Vivamus a vestibulum quam. Maecenas non metus dolor. Praesent tortor nunc, tristique at nisl molestie, vulputate eleifend diam. Integer ultrices lacus odio, vel imperdiet enim accumsan id. Sed ligula tortor, interdum eu velit eget, pharetra pulvinar magna. Sed non lacus in eros tincidunt sagittis ac vel justo. Donec vitae leo sagittis, accumsan ante sit amet, accumsan odio. Ut volutpat ultricies tortor. Vestibulum tempus purus et est tristique sagittis quis vitae turpis.
+
+Nam iaculis neque lacus, eget euismod turpis blandit eget. In hac habitasse platea dictumst. Phasellus justo neque, scelerisque sit amet risus ut, pretium commodo nisl. Phasellus auctor sapien sed ex bibendum fermentum. Proin maximus odio a ante ornare, a feugiat lorem egestas. Etiam efficitur tortor a ante tincidunt interdum. Nullam non est ac massa congue efficitur sit amet nec eros. Nullam at ipsum vel mauris tincidunt efficitur. Duis pulvinar nisl elit, id auctor risus laoreet ac. Sed nunc mauris, tristique id leo ut, condimentum congue nunc. Sed ultricies, mauris et convallis faucibus, justo ex faucibus est, at lobortis purus justo non arcu. Integer vel facilisis elit, dapibus imperdiet mauris.
+
+Pellentesque non mattis turpis, eget bibendum velit. Fusce sollicitudin ante ac tincidunt rhoncus. Praesent porta scelerisque consequat. Donec eleifend faucibus sollicitudin. Quisque vitae purus eget tortor tempor ultrices. Maecenas mauris diam, semper vitae est non, imperdiet tempor magna. Duis elit lacus, auctor vestibulum enim eget, rhoncus porttitor tortor.
+
+Donec non rhoncus nibh. Cras dapibus justo vitae nunc accumsan, id congue erat egestas. Aenean at ante ante. Duis eleifend imperdiet dREADALL
diff --git a/test/image/mysql.sql b/test/image/mysql.sql
new file mode 100644
index 000000000..51554b98d
--- /dev/null
+++ b/test/image/mysql.sql
@@ -0,0 +1,23 @@
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SHOW databases;
+USE mysql;
+
+CREATE TABLE foo (id int);
+INSERT INTO foo VALUES(1);
+SELECT * FROM foo;
+DROP TABLE foo;
+
+shutdown;
diff --git a/test/image/ruby.rb b/test/image/ruby.rb
new file mode 100644
index 000000000..aced49c6d
--- /dev/null
+++ b/test/image/ruby.rb
@@ -0,0 +1,23 @@
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'sinatra'
+
+set :bind, "0.0.0.0"
+set :port, 8080
+
+get '/' do
+  'Hello World'
+end
+
diff --git a/test/image/ruby.sh b/test/image/ruby.sh
new file mode 100644
index 000000000..ebe8d5b0e
--- /dev/null
+++ b/test/image/ruby.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+gem install sinatra
+ruby /src/ruby.rb
diff --git a/test/root/BUILD b/test/root/BUILD
new file mode 100644
index 000000000..f130df2c7
--- /dev/null
+++ b/test/root/BUILD
@@ -0,0 +1,36 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "root",
+    srcs = ["root.go"],
+    importpath = "gvisor.dev/gvisor/test/root",
+)
+
+go_test(
+    name = "root_test",
+    size = "small",
+    srcs = [
+        "cgroup_test.go",
+        "chroot_test.go",
+        "crictl_test.go",
+    ],
+    embed = [":root"],
+    tags = [
+        # Requires docker and runsc to be configured before the test runs.
+        # Also test only runs as root.
+        "manual",
+        "local",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//runsc/cgroup",
+        "//runsc/criutil",
+        "//runsc/dockerutil",
+        "//runsc/specutils",
+        "//runsc/testutil",
+        "//test/root/testdata",
+        "@com_github_syndtr_gocapability//capability:go_default_library",
+    ],
+)
diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
new file mode 100644
index 000000000..cc7e8583e
--- /dev/null
+++ b/test/root/cgroup_test.go
@@ -0,0 +1,238 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package root
+
+import (
+	"bufio"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+func verifyPid(pid int, path string) error {
+	f, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	var gots []int
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		got, err := strconv.Atoi(scanner.Text())
+		if err != nil {
+			return err
+		}
+		if got == pid {
+			return nil
+		}
+		gots = append(gots, got)
+	}
+	if scanner.Err() != nil {
+		return scanner.Err()
+	}
+	return fmt.Errorf("got: %s, want: %d", gots, pid)
+}
+
+// TestCgroup sets cgroup options and checks that cgroup was properly configured.
+func TestCgroup(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("cgroup-test")
+
+	attrs := []struct {
+		arg            string
+		ctrl           string
+		file           string
+		want           string
+		skipIfNotFound bool
+	}{
+		{
+			arg:  "--cpu-shares=1000",
+			ctrl: "cpu",
+			file: "cpu.shares",
+			want: "1000",
+		},
+		{
+			arg:  "--cpu-period=2000",
+			ctrl: "cpu",
+			file: "cpu.cfs_period_us",
+			want: "2000",
+		},
+		{
+			arg:  "--cpu-quota=3000",
+			ctrl: "cpu",
+			file: "cpu.cfs_quota_us",
+			want: "3000",
+		},
+		{
+			arg:  "--cpuset-cpus=0",
+			ctrl: "cpuset",
+			file: "cpuset.cpus",
+			want: "0",
+		},
+		{
+			arg:  "--cpuset-mems=0",
+			ctrl: "cpuset",
+			file: "cpuset.mems",
+			want: "0",
+		},
+		{
+			arg:  "--kernel-memory=100MB",
+			ctrl: "memory",
+			file: "memory.kmem.limit_in_bytes",
+			want: "104857600",
+		},
+		{
+			arg:  "--memory=1GB",
+			ctrl: "memory",
+			file: "memory.limit_in_bytes",
+			want: "1073741824",
+		},
+		{
+			arg:  "--memory-reservation=500MB",
+			ctrl: "memory",
+			file: "memory.soft_limit_in_bytes",
+			want: "524288000",
+		},
+		{
+			arg:            "--memory-swap=2GB",
+			ctrl:           "memory",
+			file:           "memory.memsw.limit_in_bytes",
+			want:           "2147483648",
+			skipIfNotFound: true, // swap may be disabled on the machine.
+		},
+		{
+			arg:  "--memory-swappiness=5",
+			ctrl: "memory",
+			file: "memory.swappiness",
+			want: "5",
+		},
+		{
+			arg:  "--blkio-weight=750",
+			ctrl: "blkio",
+			file: "blkio.weight",
+			want: "750",
+		},
+	}
+
+	args := make([]string, 0, len(attrs))
+	for _, attr := range attrs {
+		args = append(args, attr.arg)
+	}
+
+	args = append(args, "alpine", "sleep", "10000")
+	if err := d.Run(args...); err != nil {
+		t.Fatal("docker create failed:", err)
+	}
+	defer d.CleanUp()
+
+	gid, err := d.ID()
+	if err != nil {
+		t.Fatalf("Docker.ID() failed: %v", err)
+	}
+	t.Logf("cgroup ID: %s", gid)
+
+	// Check list of attributes defined above.
+	for _, attr := range attrs {
+		path := filepath.Join("/sys/fs/cgroup", attr.ctrl, "docker", gid, attr.file)
+		out, err := ioutil.ReadFile(path)
+		if err != nil {
+			if os.IsNotExist(err) && attr.skipIfNotFound {
+				t.Logf("skipped %s/%s", attr.ctrl, attr.file)
+				continue
+			}
+			t.Fatalf("failed to read %q: %v", path, err)
+		}
+		if got := strings.TrimSpace(string(out)); got != attr.want {
+			t.Errorf("arg: %q, cgroup attribute %s/%s, got: %q, want: %q", attr.arg, attr.ctrl, attr.file, got, attr.want)
+		}
+	}
+
+	// Check that sandbox is inside cgroup.
+	controllers := []string{
+		"blkio",
+		"cpu",
+		"cpuset",
+		"memory",
+		"net_cls",
+		"net_prio",
+		"devices",
+		"freezer",
+		"perf_event",
+		"pids",
+		"systemd",
+	}
+	pid, err := d.SandboxPid()
+	if err != nil {
+		t.Fatalf("SandboxPid: %v", err)
+	}
+	for _, ctrl := range controllers {
+		path := filepath.Join("/sys/fs/cgroup", ctrl, "docker", gid, "cgroup.procs")
+		if err := verifyPid(pid, path); err != nil {
+			t.Errorf("cgroup control %q processes: %v", ctrl, err)
+		}
+	}
+}
+
+func TestCgroupParent(t *testing.T) {
+	if err := dockerutil.Pull("alpine"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := dockerutil.MakeDocker("cgroup-test")
+
+	parent := testutil.RandomName("runsc")
+	if err := d.Run("--cgroup-parent", parent, "alpine", "sleep", "10000"); err != nil {
+		t.Fatal("docker create failed:", err)
+	}
+	defer d.CleanUp()
+	gid, err := d.ID()
+	if err != nil {
+		t.Fatalf("Docker.ID() failed: %v", err)
+	}
+	t.Logf("cgroup ID: %s", gid)
+
+	// Check that sandbox is inside cgroup.
+	pid, err := d.SandboxPid()
+	if err != nil {
+		t.Fatalf("SandboxPid: %v", err)
+	}
+
+	// Finds cgroup for the sandbox's parent process to check that cgroup is
+	// created in the right location relative to the parent.
+	cmd := fmt.Sprintf("grep PPid: /proc/%d/status | sed 's/PPid:\\s//'", pid)
+	ppid, err := exec.Command("bash", "-c", cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("Executing %q: %v", cmd, err)
+	}
+	cgroups, err := cgroup.LoadPaths(strings.TrimSpace(string(ppid)))
+	if err != nil {
+		t.Fatalf("cgroup.LoadPath(%s): %v", ppid, err)
+	}
+	path := filepath.Join("/sys/fs/cgroup/memory", cgroups["memory"], parent, gid, "cgroup.procs")
+	if err := verifyPid(pid, path); err != nil {
+		t.Errorf("cgroup control %q processes: %v", "memory", err)
+	}
+}
diff --git a/test/root/chroot_test.go b/test/root/chroot_test.go
new file mode 100644
index 000000000..f47f8e2c2
--- /dev/null
+++ b/test/root/chroot_test.go
@@ -0,0 +1,158 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package root is used for tests that requires sysadmin privileges run.
+package root
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/syndtr/gocapability/capability"
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned
+// up after the sandbox is destroyed.
+func TestChroot(t *testing.T) {
+	d := dockerutil.MakeDocker("chroot-test")
+	if err := d.Run("alpine", "sleep", "10000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	pid, err := d.SandboxPid()
+	if err != nil {
+		t.Fatalf("Docker.SandboxPid(): %v", err)
+	}
+
+	// Check that sandbox is chroot'ed.
+	procRoot := filepath.Join("/proc", strconv.Itoa(pid), "root")
+	chroot, err := filepath.EvalSymlinks(procRoot)
+	if err != nil {
+		t.Fatalf("error resolving /proc/<pid>/root symlink: %v", err)
+	}
+	if chroot != "/" {
+		t.Errorf("sandbox is not chroot'd, it should be inside: /, got: %q", chroot)
+	}
+
+	path, err := filepath.EvalSymlinks(filepath.Join("/proc", strconv.Itoa(pid), "cwd"))
+	if err != nil {
+		t.Fatalf("error resolving /proc/<pid>/cwd symlink: %v", err)
+	}
+	if chroot != path {
+		t.Errorf("sandbox current dir is wrong, want: %q, got: %q", chroot, path)
+	}
+
+	fi, err := ioutil.ReadDir(procRoot)
+	if err != nil {
+		t.Fatalf("error listing %q: %v", chroot, err)
+	}
+	if want, got := 1, len(fi); want != got {
+		t.Fatalf("chroot dir got %d entries, want %d", got, want)
+	}
+
+	// chroot dir is prepared by runsc and should contains only /proc.
+	if fi[0].Name() != "proc" {
+		t.Errorf("chroot got children %v, want %v", fi[0].Name(), "proc")
+	}
+
+	d.CleanUp()
+}
+
+func TestChrootGofer(t *testing.T) {
+	d := dockerutil.MakeDocker("chroot-test")
+	if err := d.Run("alpine", "sleep", "10000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	// It's tricky to find gofers. Get sandbox PID first, then find parent. From
+	// parent get all immediate children, remove the sandbox, and everything else
+	// are gofers.
+	sandPID, err := d.SandboxPid()
+	if err != nil {
+		t.Fatalf("Docker.SandboxPid(): %v", err)
+	}
+
+	// Find sandbox's parent PID.
+	cmd := fmt.Sprintf("grep PPid /proc/%d/status | awk '{print $2}'", sandPID)
+	parent, err := exec.Command("sh", "-c", cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to fetch runsc (%d) parent PID: %v, out:\n%s", sandPID, err, string(parent))
+	}
+	parentPID, err := strconv.Atoi(strings.TrimSpace(string(parent)))
+	if err != nil {
+		t.Fatalf("failed to parse PPID %q: %v", string(parent), err)
+	}
+
+	// Get all children from parent.
+	childrenOut, err := exec.Command("/usr/bin/pgrep", "-P", strconv.Itoa(parentPID)).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to fetch containerd-shim children: %v", err)
+	}
+	children := strings.Split(strings.TrimSpace(string(childrenOut)), "\n")
+
+	// This where the root directory is mapped on the host and that's where the
+	// gofer must have chroot'd to.
+	root := "/root"
+
+	for _, child := range children {
+		childPID, err := strconv.Atoi(child)
+		if err != nil {
+			t.Fatalf("failed to parse child PID %q: %v", child, err)
+		}
+		if childPID == sandPID {
+			// Skip the sandbox, all other immediate children are gofers.
+			continue
+		}
+
+		// Check that gofer is chroot'ed.
+		chroot, err := filepath.EvalSymlinks(filepath.Join("/proc", child, "root"))
+		if err != nil {
+			t.Fatalf("error resolving /proc/<pid>/root symlink: %v", err)
+		}
+		if root != chroot {
+			t.Errorf("gofer chroot is wrong, want: %q, got: %q", root, chroot)
+		}
+
+		path, err := filepath.EvalSymlinks(filepath.Join("/proc", child, "cwd"))
+		if err != nil {
+			t.Fatalf("error resolving /proc/<pid>/cwd symlink: %v", err)
+		}
+		if root != path {
+			t.Errorf("gofer current dir is wrong, want: %q, got: %q", root, path)
+		}
+	}
+}
+
+func TestMain(m *testing.M) {
+	dockerutil.EnsureSupportedDockerVersion()
+
+	if !specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_DAC_OVERRIDE) {
+		fmt.Println("Test requires sysadmin privileges to run. Try again with sudo.")
+		os.Exit(1)
+	}
+
+	flag.Parse()
+	os.Exit(m.Run())
+}
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
new file mode 100644
index 000000000..d597664f5
--- /dev/null
+++ b/test/root/crictl_test.go
@@ -0,0 +1,242 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package root
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/runsc/criutil"
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/test/root/testdata"
+)
+
+// Tests for crictl have to be run as root (rather than in a user namespace)
+// because crictl creates named network namespaces in /var/run/netns/.
+
+// TestCrictlSanity refers to b/112433158.
+func TestCrictlSanity(t *testing.T) {
+	// Setup containerd and crictl.
+	crictl, cleanup, err := setup(t)
+	if err != nil {
+		t.Fatalf("failed to setup crictl: %v", err)
+	}
+	defer cleanup()
+	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Look for the httpd page.
+	if err = httpGet(crictl, podID, "index.html"); err != nil {
+		t.Fatalf("failed to get page: %v", err)
+	}
+
+	// Stop everything.
+	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestMountPaths refers to b/117635704.
+func TestMountPaths(t *testing.T) {
+	// Setup containerd and crictl.
+	crictl, cleanup, err := setup(t)
+	if err != nil {
+		t.Fatalf("failed to setup crictl: %v", err)
+	}
+	defer cleanup()
+	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.HttpdMountPaths)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Look for the directory available at /test.
+	if err = httpGet(crictl, podID, "test"); err != nil {
+		t.Fatalf("failed to get page: %v", err)
+	}
+
+	// Stop everything.
+	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestMountPaths refers to b/118728671.
+func TestMountOverSymlinks(t *testing.T) {
+	// Setup containerd and crictl.
+	crictl, cleanup, err := setup(t)
+	if err != nil {
+		t.Fatalf("failed to setup crictl: %v", err)
+	}
+	defer cleanup()
+	podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, testdata.MountOverSymlink)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) {
+		t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out))
+	}
+
+	etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf")
+	if err != nil {
+		t.Fatal(err)
+	}
+	tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if tmp != etc {
+		t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp))
+	}
+
+	// Stop everything.
+	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// setup sets up before a test. Specifically it:
+// * Creates directories and a socket for containerd to utilize.
+// * Runs containerd and waits for it to reach a "ready" state for testing.
+// * Returns a cleanup function that should be called at the end of the test.
+func setup(t *testing.T) (*criutil.Crictl, func(), error) {
+	var cleanups []func()
+	cleanupFunc := func() {
+		for i := len(cleanups) - 1; i >= 0; i-- {
+			cleanups[i]()
+		}
+	}
+	cleanup := specutils.MakeCleanup(cleanupFunc)
+	defer cleanup.Clean()
+
+	// Create temporary containerd root and state directories, and a socket
+	// via which crictl and containerd communicate.
+	containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root")
+	if err != nil {
+		t.Fatalf("failed to create containerd root: %v", err)
+	}
+	cleanups = append(cleanups, func() { os.RemoveAll(containerdRoot) })
+	containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state")
+	if err != nil {
+		t.Fatalf("failed to create containerd state: %v", err)
+	}
+	cleanups = append(cleanups, func() { os.RemoveAll(containerdState) })
+	sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock")
+
+	// We rewrite a configuration. This is based on the current docker
+	// configuration for the runtime under test.
+	runtime, err := dockerutil.RuntimePath()
+	if err != nil {
+		t.Fatalf("error discovering runtime path: %v", err)
+	}
+	config, err := testutil.WriteTmpFile("containerd-config", testdata.ContainerdConfig(runtime))
+	if err != nil {
+		t.Fatalf("failed to write containerd config")
+	}
+	cleanups = append(cleanups, func() { os.RemoveAll(config) })
+
+	// Start containerd.
+	containerd := exec.Command(getContainerd(),
+		"--config", config,
+		"--log-level", "debug",
+		"--root", containerdRoot,
+		"--state", containerdState,
+		"--address", sockAddr)
+	cleanups = append(cleanups, func() {
+		if err := testutil.KillCommand(containerd); err != nil {
+			log.Printf("error killing containerd: %v", err)
+		}
+	})
+	containerdStderr, err := containerd.StderrPipe()
+	if err != nil {
+		t.Fatalf("failed to get containerd stderr: %v", err)
+	}
+	containerdStdout, err := containerd.StdoutPipe()
+	if err != nil {
+		t.Fatalf("failed to get containerd stdout: %v", err)
+	}
+	if err := containerd.Start(); err != nil {
+		t.Fatalf("failed running containerd: %v", err)
+	}
+
+	// Wait for containerd to boot. Then put all containerd output into a
+	// buffer to be logged at the end of the test.
+	testutil.WaitUntilRead(containerdStderr, "Start streaming server", nil, 10*time.Second)
+	stdoutBuf := &bytes.Buffer{}
+	stderrBuf := &bytes.Buffer{}
+	go func() { io.Copy(stdoutBuf, containerdStdout) }()
+	go func() { io.Copy(stderrBuf, containerdStderr) }()
+	cleanups = append(cleanups, func() {
+		t.Logf("containerd stdout: %s", string(stdoutBuf.Bytes()))
+		t.Logf("containerd stderr: %s", string(stderrBuf.Bytes()))
+	})
+
+	cleanup.Release()
+	return criutil.NewCrictl(20*time.Second, sockAddr), cleanupFunc, nil
+}
+
+// httpGet GETs the contents of a file served from a pod on port 80.
+func httpGet(crictl *criutil.Crictl, podID, filePath string) error {
+	// Get the IP of the httpd server.
+	ip, err := crictl.PodIP(podID)
+	if err != nil {
+		return fmt.Errorf("failed to get IP from pod %q: %v", podID, err)
+	}
+
+	// GET the page. We may be waiting for the server to start, so retry
+	// with a timeout.
+	var resp *http.Response
+	cb := func() error {
+		r, err := http.Get(fmt.Sprintf("http://%s", path.Join(ip, filePath)))
+		resp = r
+		return err
+	}
+	if err := testutil.Poll(cb, 20*time.Second); err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != 200 {
+		return fmt.Errorf("bad status returned: %d", resp.StatusCode)
+	}
+	return nil
+}
+
+func getContainerd() string {
+	// Use the local path if it exists, otherwise, use the system one.
+	if _, err := os.Stat("/usr/local/bin/containerd"); err == nil {
+		return "/usr/local/bin/containerd"
+	}
+	return "/usr/bin/containerd"
+}
diff --git a/test/root/root.go b/test/root/root.go
new file mode 100644
index 000000000..349c752cc
--- /dev/null
+++ b/test/root/root.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package root is empty. See chroot_test.go for description.
+package root
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
new file mode 100644
index 000000000..14c19ef1e
--- /dev/null
+++ b/test/root/testdata/BUILD
@@ -0,0 +1,18 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "testdata",
+    srcs = [
+        "busybox.go",
+        "containerd_config.go",
+        "httpd.go",
+        "httpd_mount_paths.go",
+        "sandbox.go",
+    ],
+    importpath = "gvisor.dev/gvisor/test/root/testdata",
+    visibility = [
+        "//visibility:public",
+    ],
+)
diff --git a/test/root/testdata/busybox.go b/test/root/testdata/busybox.go
new file mode 100644
index 000000000..e4dbd2843
--- /dev/null
+++ b/test/root/testdata/busybox.go
@@ -0,0 +1,32 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testdata
+
+// MountOverSymlink is a JSON config for a container that /etc/resolv.conf is a
+// symlink to /tmp/resolv.conf.
+var MountOverSymlink = `
+{
+        "metadata": {
+                "name": "busybox"
+        },
+        "image": {
+                "image": "k8s.gcr.io/busybox"
+        },
+        "command": [
+                "sleep",
+                "1000"
+        ]
+}
+`
diff --git a/test/root/testdata/containerd_config.go b/test/root/testdata/containerd_config.go
new file mode 100644
index 000000000..e12f1ec88
--- /dev/null
+++ b/test/root/testdata/containerd_config.go
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testdata contains data required for root tests.
+package testdata
+
+import "fmt"
+
+// containerdConfigTemplate is a .toml config for containerd. It contains a
+// formatting verb so the runtime field can be set via fmt.Sprintf.
+const containerdConfigTemplate = `
+disabled_plugins = ["restart"]
+[plugins.linux]
+  runtime = "%s"
+  runtime_root = "/tmp/test-containerd/runsc"
+  shim = "/usr/local/bin/gvisor-containerd-shim"
+  shim_debug = true
+
+[plugins.cri.containerd.runtimes.runsc]
+  runtime_type = "io.containerd.runtime.v1.linux"
+  runtime_engine = "%s"
+`
+
+// ContainerdConfig returns a containerd config file with the specified
+// runtime.
+func ContainerdConfig(runtime string) string {
+	return fmt.Sprintf(containerdConfigTemplate, runtime, runtime)
+}
diff --git a/test/root/testdata/httpd.go b/test/root/testdata/httpd.go
new file mode 100644
index 000000000..45d5e33d4
--- /dev/null
+++ b/test/root/testdata/httpd.go
@@ -0,0 +1,32 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testdata
+
+// Httpd is a JSON config for an httpd container.
+const Httpd = `
+{
+  "metadata": {
+    "name": "httpd"
+  },
+  "image":{
+    "image": "httpd"
+  },
+  "mounts": [
+  ],
+  "linux": {
+  },
+  "log_path": "httpd.log"
+}
+`
diff --git a/test/root/testdata/httpd_mount_paths.go b/test/root/testdata/httpd_mount_paths.go
new file mode 100644
index 000000000..ac3f4446a
--- /dev/null
+++ b/test/root/testdata/httpd_mount_paths.go
@@ -0,0 +1,53 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testdata
+
+// HttpdMountPaths is a JSON config for an httpd container with additional
+// mounts.
+const HttpdMountPaths = `
+{
+  "metadata": {
+    "name": "httpd"
+  },
+  "image":{
+    "image": "httpd"
+  },
+  "mounts": [
+      {
+        "container_path": "/var/run/secrets/kubernetes.io/serviceaccount",
+        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx",
+        "readonly": true
+      },
+      {
+        "container_path": "/etc/hosts",
+        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts",
+        "readonly": false
+      },
+      {
+        "container_path": "/dev/termination-log",
+        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580",
+        "readonly": false
+      },
+      {
+        "container_path": "/usr/local/apache2/htdocs/test",
+        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064",
+        "readonly": true
+      }
+  ],
+  "linux": {
+  },
+  "log_path": "httpd.log"
+}
+`
diff --git a/test/root/testdata/sandbox.go b/test/root/testdata/sandbox.go
new file mode 100644
index 000000000..0db210370
--- /dev/null
+++ b/test/root/testdata/sandbox.go
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testdata
+
+// Sandbox is a default JSON config for a sandbox.
+const Sandbox = `
+{
+    "metadata": {
+        "name": "default-sandbox",
+        "namespace": "default",
+        "attempt": 1,
+        "uid": "hdishd83djaidwnduwk28bcsb"
+    },
+    "linux": {
+    },
+    "log_directory": "/tmp"
+}
+`
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index e85804a83..5616a8b7b 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -1,7 +1,7 @@
 # These packages are used to run language runtime tests inside gVisor sandboxes.
 
 load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("//runsc/test:build_defs.bzl", "runtime_test")
+load("//test/runtimes:build_defs.bzl", "runtime_test")
 
 package(licenses = ["notice"])
 
@@ -21,5 +21,5 @@ runtime_test(
         "manual",
         "local",
     ],
-    deps = ["//runsc/test/testutil"],
+    deps = ["//runsc/testutil"],
 )
diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
new file mode 100644
index 000000000..ac28cc037
--- /dev/null
+++ b/test/runtimes/build_defs.bzl
@@ -0,0 +1,19 @@
+"""Defines a rule for runsc test targets."""
+
+load("@io_bazel_rules_go//go:def.bzl", _go_test = "go_test")
+
+# runtime_test is a macro that will create targets to run the given test target
+# with different runtime options.
+def runtime_test(**kwargs):
+    """Runs the given test target with different runtime options."""
+    name = kwargs["name"]
+    _go_test(**kwargs)
+    kwargs["name"] = name + "_hostnet"
+    kwargs["args"] = ["--runtime-type=hostnet"]
+    _go_test(**kwargs)
+    kwargs["name"] = name + "_kvm"
+    kwargs["args"] = ["--runtime-type=kvm"]
+    _go_test(**kwargs)
+    kwargs["name"] = name + "_overlay"
+    kwargs["args"] = ["--runtime-type=overlay"]
+    _go_test(**kwargs)
diff --git a/test/runtimes/common/BUILD b/test/runtimes/common/BUILD
index 1b39606b8..b4740bb97 100644
--- a/test/runtimes/common/BUILD
+++ b/test/runtimes/common/BUILD
@@ -15,6 +15,6 @@ go_test(
     srcs = ["common_test.go"],
     deps = [
         ":common",
-        "//runsc/test/testutil",
+        "//runsc/testutil",
     ],
 )
diff --git a/test/runtimes/common/common_test.go b/test/runtimes/common/common_test.go
index 4fb1e482a..65875b41b 100644
--- a/test/runtimes/common/common_test.go
+++ b/test/runtimes/common/common_test.go
@@ -23,7 +23,7 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 	"gvisor.dev/gvisor/test/runtimes/common"
 )
 
diff --git a/test/runtimes/runtimes_test.go b/test/runtimes/runtimes_test.go
index 9421021a1..0ff5dda02 100644
--- a/test/runtimes/runtimes_test.go
+++ b/test/runtimes/runtimes_test.go
@@ -19,7 +19,7 @@ import (
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // Wait time for each test to run.
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a8a2e75d3..58eb1154a 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -693,6 +693,7 @@ syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
 go_binary(
     name = "syscall_test_runner",
+    testonly = 1,
     srcs = ["syscall_test_runner.go"],
     data = [
         "//runsc",
@@ -700,7 +701,7 @@ go_binary(
     deps = [
         "//pkg/log",
         "//runsc/specutils",
-        "//runsc/test/testutil",
+        "//runsc/testutil",
         "//test/syscalls/gtest",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index 60df47798..e94ef5602 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -94,6 +94,7 @@ def _syscall_test(
     # more stable.
     if platform == "kvm":
         tags += ["manual"]
+        tags += ["requires-kvm"]
 
     args = [
         # Arguments are passed directly to syscall_test_runner binary.
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index 32408f021..e900f8abc 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -35,7 +35,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/test/testutil"
+	"gvisor.dev/gvisor/runsc/testutil"
 	"gvisor.dev/gvisor/test/syscalls/gtest"
 )
 
diff --git a/tools/make_repository.sh b/tools/make_repository.sh
new file mode 100755
index 000000000..bf9c50d74
--- /dev/null
+++ b/tools/make_repository.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Parse arguments. We require more than two arguments, which are the private
+# keyring, the e-mail associated with the signer, and the list of packages.
+if [ "$#" -le 2 ]; then
+  echo "usage: $0 <private-key> <signer-email> <packages...>"
+  exit 1
+fi
+declare -r private_key=$(readlink -e "$1")
+declare -r signer="$2"
+shift; shift
+
+# Verbose from this point.
+set -xeo pipefail
+
+# Create a temporary working directory. We don't remove this, as we ultimately
+# print this result and allow the caller to copy wherever they would like.
+declare -r tmpdir=$(mktemp -d /tmp/repoXXXXXX)
+
+# Create a temporary keyring, and ensure it is cleaned up.
+declare -r keyring=$(mktemp /tmp/keyringXXXXXX.gpg)
+cleanup() {
+  rm -f "${keyring}"
+}
+trap cleanup EXIT
+gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
+
+# Export the public key from the keyring.
+gpg --no-default-keyring --keyring "${keyring}" --armor --export "${signer}" > "${tmpdir}"/keyFile
+
+# Copy the packages, and ensure permissions are correct.
+cp -a "$@" "${tmpdir}" && chmod 0644 "${tmpdir}"/*
+
+# Ensure there are no symlinks hanging around; these may be remnants of the
+# build process. They may be useful for other things, but we are going to build
+# an index of the actual packages here.
+find "${tmpdir}" -type l -exec rm -f {} \;
+
+# Sign all packages.
+for file in "${tmpdir}"/*.deb; do
+  dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${file}"
+done
+
+# Build the package list.
+(cd "${tmpdir}" && apt-ftparchive packages . | gzip > Packages.gz)
+
+# Build the release list.
+(cd "${tmpdir}" && apt-ftparchive release . > Release)
+
+# Sign the release.
+(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign -o InRelease Release)
+(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" -abs -o Release.gpg Release)
+
+# Show the results.
+echo "${tmpdir}"
diff --git a/tools/run_build.sh b/tools/run_build.sh
deleted file mode 100755
index 7f6ada480..000000000
--- a/tools/run_build.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error.
-set -e
-# Display commands to stderr.
-set -x
-
-# Install the latest version of Bazel and log the version.
-(which use_bazel.sh && use_bazel.sh latest) || which bazel
-bazel version
-
-# Switch into the workspace.
-if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
-  cd git/repo
-elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
-  cd github/repo
-fi
-
-# Build runsc.
-bazel build -c opt --strip=never //runsc
-
-# Move the runsc binary into "latest" directory, and also a directory with the
-# current date.
-if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
-  latest_dir="${KOKORO_ARTIFACTS_DIR}"/latest
-  today_dir="${KOKORO_ARTIFACTS_DIR}"/"$(date -Idate)"
-  runsc="bazel-bin/runsc/linux_amd64_pure/runsc"
-
-  mkdir -p "${latest_dir}" "${today_dir}"
-  cp "${runsc}" "${latest_dir}"
-  cp "${runsc}" "${today_dir}"
-
-  sha512sum "${latest_dir}"/runsc | awk '{print $1 "  runsc"}' > "${latest_dir}"/runsc.sha512
-  cp "${latest_dir}"/runsc.sha512 "${today_dir}"/runsc.sha512
-fi
diff --git a/tools/run_tests.sh b/tools/run_tests.sh
deleted file mode 100755
index 6fe80a36b..000000000
--- a/tools/run_tests.sh
+++ /dev/null
@@ -1,304 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error. Treat unset variables as error. Print commands as executed.
-set -eux
-
-###################
-# GLOBAL ENV VARS #
-###################
-
-if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
-  readonly WORKSPACE_DIR="${PWD}/git/repo"
-elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
-  readonly WORKSPACE_DIR="${PWD}/github/repo"
-else
-  readonly WORKSPACE_DIR="${PWD}"
-fi
-
-# Used to configure RBE.
-readonly CLOUD_PROJECT_ID="gvisor-rbe"
-readonly RBE_PROJECT_ID="projects/${CLOUD_PROJECT_ID}/instances/default_instance"
-
-# Random runtime name to avoid collisions.
-readonly RUNTIME="runsc_test_$((RANDOM))"
-
-# Packages that will be built and tested.
-readonly BUILD_PACKAGES=("//...")
-readonly TEST_PACKAGES=("//pkg/..." "//runsc/..." "//tools/...")
-
-#######################
-# BAZEL CONFIGURATION #
-#######################
-
-# Install the latest version of Bazel and log the version.
-(which use_bazel.sh && use_bazel.sh 0.28.0) || which bazel
-bazel version
-
-# Load the kvm module.
-sudo -n -E modprobe kvm
-
-# General Bazel build/test flags.
-BAZEL_BUILD_FLAGS=(
-  "--show_timestamps"
-  "--test_output=errors"
-  "--keep_going"
-  "--verbose_failures=true"
-)
-
-# Bazel build/test for RBE, a super-set of BAZEL_BUILD_FLAGS.
-BAZEL_BUILD_RBE_FLAGS=(
-  "${BAZEL_BUILD_FLAGS[@]}"
-  "--config=remote"
-  "--project_id=${CLOUD_PROJECT_ID}"
-  "--remote_instance_name=${RBE_PROJECT_ID}"
-)
-if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
-  BAZEL_BUILD_RBE_FLAGS=(
-    "${BAZEL_BUILD_RBE_FLAGS[@]}"
-    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-  )
-fi
-
-####################
-# Helper Functions #
-####################
-
-sanity_checks() {
-  cd ${WORKSPACE_DIR}
-  bazel run //:gazelle -- update-repos -from_file=go.mod
-  git diff --exit-code WORKSPACE
-}
-
-build_everything() {
-  FLAVOR="${1}"
-
-  cd ${WORKSPACE_DIR}
-  bazel build \
-    -c "${FLAVOR}" "${BAZEL_BUILD_RBE_FLAGS[@]}" \
-    "${BUILD_PACKAGES[@]}"
-}
-
-build_runsc_debian() {
-  cd ${WORKSPACE_DIR}
-
-  # TODO(b/135475885): pkg_deb is incompatible with Python3.
-  # https://github.com/bazelbuild/bazel/issues/8443
-  bazel build --host_force_python=py2 runsc:runsc-debian
-}
-
-# Run simple tests runs the tests that require no special setup or
-# configuration.
-run_simple_tests() {
-  cd ${WORKSPACE_DIR}
-  bazel test \
-    "${BAZEL_BUILD_FLAGS[@]}" \
-    "${TEST_PACKAGES[@]}"
-}
-
-install_runtime() {
-  cd ${WORKSPACE_DIR}
-  sudo -n ${WORKSPACE_DIR}/runsc/test/install.sh --runtime ${RUNTIME}
-}
-
-install_helper() {
-  PACKAGE="${1}"
-  TAG="${2}"
-  GOPATH="${3}"
-
-  # Clone the repository.
-  mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
-     git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
-
-  # Checkout and build the repository.
-  (cd "${GOPATH}"/src/"${PACKAGE}" && \
-      git checkout "${TAG}" && \
-      GOPATH="${GOPATH}" make && \
-      sudo -n -E env GOPATH="${GOPATH}" make install)
-}
-
-# Install dependencies for the crictl tests.
-install_crictl_test_deps() {
-  sudo -n -E apt-get update
-  sudo -n -E apt-get install -y btrfs-tools libseccomp-dev
-
-  # Install containerd & cri-tools.
-  GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
-  install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
-  install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
-
-  # Install gvisor-containerd-shim.
-  local latest=/tmp/gvisor-containerd-shim-latest
-  local shim_path=/tmp/gvisor-containerd-shim
-  wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/latest -O ${latest}
-  wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
-  chmod +x ${shim_path}
-  sudo -n -E mv ${shim_path} /usr/local/bin
-
-  # Configure containerd-shim.
-  local shim_config_path=/etc/containerd
-  local shim_config_tmp_path=/tmp/gvisor-containerd-shim.toml
-  sudo -n -E mkdir -p ${shim_config_path}
-  cat > ${shim_config_tmp_path} <<-EOF
-    runc_shim = "/usr/local/bin/containerd-shim"
-
-    [runsc_config]
-      debug = "true"
-      debug-log = "/tmp/runsc-logs/"
-      strace = "true"
-      file-access = "shared"
-EOF
-  sudo mv ${shim_config_tmp_path} ${shim_config_path}
-
-  # Configure CNI.
-  (cd "${GOPATH}" && sudo -n -E env PATH="${PATH}" GOPATH="${GOPATH}" \
-      src/github.com/containerd/containerd/script/setup/install-cni)
-}
-
-# Run the tests that require docker.
-run_docker_tests() {
-  cd ${WORKSPACE_DIR}
-
-  # Run tests with a default runtime (runc).
-  bazel test \
-    "${BAZEL_BUILD_FLAGS[@]}" \
-    --test_env=RUNSC_RUNTIME="" \
-    //runsc/test/image:image_test
-
-  # These names are used to exclude tests not supported in certain
-  # configuration, e.g. save/restore not supported with hostnet.
-  # Run runsc tests with docker that are tagged manual.
-  #
-  # The --nocache_test_results option is used here to eliminate cached results
-  # from the previous run for the runc runtime.
-  bazel test \
-    "${BAZEL_BUILD_FLAGS[@]}" \
-    --test_env=RUNSC_RUNTIME="${RUNTIME}" \
-    --nocache_test_results \
-    //runsc/test/integration:integration_test \
-    //runsc/test/integration:integration_test_hostnet \
-    //runsc/test/integration:integration_test_overlay \
-    //runsc/test/integration:integration_test_kvm \
-    //runsc/test/image:image_test \
-    //runsc/test/image:image_test_overlay \
-    //runsc/test/image:image_test_hostnet \
-    //runsc/test/image:image_test_kvm
-}
-
-# Run the tests that require root.
-run_root_tests() {
-  cd ${WORKSPACE_DIR}
-  bazel build //runsc/test/root:root_test
-  local root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__)
-  if [[ ! -f "${root_test}" ]]; then
-    echo "root_test executable not found"
-    exit 1
-  fi
-  sudo -n -E RUNSC_RUNTIME="${RUNTIME}" RUNSC_EXEC=/tmp/"${RUNTIME}"/runsc ${root_test}
-}
-
-# Run syscall unit tests.
-run_syscall_tests() {
-  cd ${WORKSPACE_DIR}
-  bazel test "${BAZEL_BUILD_RBE_FLAGS[@]}" \
-    --test_tag_filters=runsc_ptrace //test/syscalls/...
-}
-
-run_runsc_do_tests() {
-  local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1)
-
-  # run runsc do without root privileges.
-  ${runsc} --rootless do true
-  ${runsc} --rootless --network=none do true
-
-  # run runsc do with root privileges.
-  sudo -n -E ${runsc} do true
-}
-
-# Find and rename all test xml and log files so that Sponge can pick them up.
-# XML files must be named sponge_log.xml, and log files must be named
-# sponge_log.log. We move all such files into KOKORO_ARTIFACTS_DIR, in a
-# subdirectory named with the test name.
-upload_test_artifacts() {
-  # Skip if no kokoro directory.
-  [[ -v KOKORO_ARTIFACTS_DIR ]] || return
-
-  cd ${WORKSPACE_DIR}
-  find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
-    tar --create --files-from - --transform 's/test\./sponge_log./' |
-    tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
-  if [[ -d "/tmp/${RUNTIME}/logs" ]]; then
-    tar --create --gzip "--file=${KOKORO_ARTIFACTS_DIR}/runsc-logs.tar.gz" -C /tmp/ ${RUNTIME}/logs
-  fi
-}
-
-# Finish runs in the event of an error, uploading all artifacts.
-finish() {
-  # Grab the last exit code, we will return it.
-  local exit_code=${?}
-  upload_test_artifacts
-  exit ${exit_code}
-}
-
-# Run bazel in a docker container
-build_in_docker() {
-  cd ${WORKSPACE_DIR}
-  bazel clean
-  bazel shutdown
-  make
-  make runsc
-  make bazel-shutdown
-}
-
-########
-# MAIN #
-########
-
-main() {
-  # Register finish to run at exit.
-  trap finish EXIT
-
-  # Build and run the simple tests.
-  sanity_checks
-  build_everything opt
-  run_simple_tests
-
-  # So far so good. Install more deps and run the integration tests.
-  install_runtime
-  install_crictl_test_deps
-  run_docker_tests
-  run_root_tests
-
-  run_syscall_tests
-  run_runsc_do_tests
-
-  build_runsc_debian
-
-  # Build other flavors too.
-  build_everything dbg
-
-  # We need to upload all the existing test logs and artifacts before shutting
-  # down and cleaning bazel, otherwise all test information is lost. After this
-  # point, we don't expect any logs or artifacts.
-  upload_test_artifacts
-  trap - EXIT
-
-  # Run docker build tests.
-  build_in_docker
-}
-
-# Kick it off.
-main
-- 
cgit v1.2.3


From 7e94f171f4141e91478ef8b5693db36519c2322f Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 5 Sep 2019 18:02:45 -0700
Subject: Better strace logs for statx.

PiperOrigin-RevId: 267498537
---
 pkg/sentry/strace/linux64.go | 1 +
 1 file changed, 1 insertion(+)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go
index 3650fd6e1..5d57b75af 100644
--- a/pkg/sentry/strace/linux64.go
+++ b/pkg/sentry/strace/linux64.go
@@ -335,4 +335,5 @@ var linuxAMD64 = SyscallMap{
 	315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
 	316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
 	317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+	332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
 }
-- 
cgit v1.2.3


From 3733b9b893ec33877b1b46c56fe07c3856942d3f Mon Sep 17 00:00:00 2001
From: Rahat Mahmood <rahat@google.com>
Date: Mon, 9 Sep 2019 13:35:30 -0700
Subject: go_marshal: Implement automatic generation of ABI marshalling code.

This CL implements go_marshal, a code generation utility for
automatically serializing and deserializing ABI structs.

The go_marshal tool automatically generates implementations of the new
marshal interface. Unlike binary.Marshal/Unmarshal, the generated
interface implementations use no runtime reflection, and translates to
a single memcpy for most structs. See go_marshal/README.md for
details.

PiperOrigin-RevId: 268065475
---
 pkg/sentry/BUILD                                   |   2 +
 tools/go_marshal/BUILD                             |  14 +
 tools/go_marshal/README.md                         | 164 +++++++
 tools/go_marshal/analysis/BUILD                    |  13 +
 tools/go_marshal/analysis/analysis_unsafe.go       | 175 +++++++
 tools/go_marshal/defs.bzl                          | 158 +++++++
 tools/go_marshal/gomarshal/BUILD                   |  17 +
 tools/go_marshal/gomarshal/generator.go            | 382 ++++++++++++++++
 tools/go_marshal/gomarshal/generator_interfaces.go | 507 +++++++++++++++++++++
 tools/go_marshal/gomarshal/generator_tests.go      | 154 +++++++
 tools/go_marshal/gomarshal/util.go                 | 387 ++++++++++++++++
 tools/go_marshal/main.go                           |  73 +++
 tools/go_marshal/marshal/BUILD                     |  14 +
 tools/go_marshal/marshal/marshal.go                |  60 +++
 tools/go_marshal/test/BUILD                        |  29 ++
 tools/go_marshal/test/benchmark_test.go            | 178 ++++++++
 tools/go_marshal/test/external/BUILD               |  11 +
 tools/go_marshal/test/external/external.go         |  23 +
 tools/go_marshal/test/test.go                      | 105 +++++
 19 files changed, 2466 insertions(+)
 create mode 100644 tools/go_marshal/BUILD
 create mode 100644 tools/go_marshal/README.md
 create mode 100644 tools/go_marshal/analysis/BUILD
 create mode 100644 tools/go_marshal/analysis/analysis_unsafe.go
 create mode 100644 tools/go_marshal/defs.bzl
 create mode 100644 tools/go_marshal/gomarshal/BUILD
 create mode 100644 tools/go_marshal/gomarshal/generator.go
 create mode 100644 tools/go_marshal/gomarshal/generator_interfaces.go
 create mode 100644 tools/go_marshal/gomarshal/generator_tests.go
 create mode 100644 tools/go_marshal/gomarshal/util.go
 create mode 100644 tools/go_marshal/main.go
 create mode 100644 tools/go_marshal/marshal/BUILD
 create mode 100644 tools/go_marshal/marshal/marshal.go
 create mode 100644 tools/go_marshal/test/BUILD
 create mode 100644 tools/go_marshal/test/benchmark_test.go
 create mode 100644 tools/go_marshal/test/external/BUILD
 create mode 100644 tools/go_marshal/test/external/external.go
 create mode 100644 tools/go_marshal/test/test.go

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index 53989301f..2d6379c86 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -8,5 +8,7 @@ package_group(
     packages = [
         "//pkg/sentry/...",
         "//runsc/...",
+        # Code generated by go_marshal relies on go_marshal libraries.
+        "//tools/go_marshal/...",
     ],
 )
diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD
new file mode 100644
index 000000000..c862b277c
--- /dev/null
+++ b/tools/go_marshal/BUILD
@@ -0,0 +1,14 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "go_marshal",
+    srcs = ["main.go"],
+    visibility = [
+        "//:sandbox",
+    ],
+    deps = [
+        "//tools/go_marshal/gomarshal",
+    ],
+)
diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md
new file mode 100644
index 000000000..481575bd3
--- /dev/null
+++ b/tools/go_marshal/README.md
@@ -0,0 +1,164 @@
+This package implements the go_marshal utility.
+
+# Overview
+
+`go_marshal` is a code generation utility similar to `go_stateify` for
+automatically generating code to marshal go data structures to memory.
+
+`go_marshal` attempts to improve on `binary.Write` and the sentry's
+`binary.Marshal` by moving the go runtime reflection necessary to marshal a
+struct to compile-time.
+
+`go_marshal` automatically generates implementations for `abi.Marshallable` and
+`safemem.{Reader,Writer}`. Call-sites for serialization (typically syscall
+implementations) can directly invoke `safemem.Reader.ReadToBlocks` and
+`safemem.Writer.WriteFromBlocks`. Data structures that require custom
+serialization will have manual implementations for these interfaces.
+
+Data structures can be flagged for code generation by adding a struct-level
+comment `// +marshal`.
+
+# Usage
+
+See `defs.bzl`: two new rules are provided, `go_marshal` and `go_library`.
+
+The recommended way to generate a go library with marshalling is to use the
+`go_library` with mostly identical configuration as the native go_library rule.
+
+```
+load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
+
+go_library(
+    name = "foo",
+    srcs = ["foo.go"],
+)
+```
+
+Under the hood, the `go_marshal` rule is used to generate a file that will
+appear in a Go target; the output file should appear explicitly in a srcs list.
+For example (note that the above is the preferred method):
+
+```
+load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_marshal")
+
+go_marshal(
+    name = "foo_abi",
+    srcs = ["foo.go"],
+    out = "foo_abi.go",
+    package = "foo",
+)
+
+go_library(
+    name = "foo",
+    srcs = [
+        "foo.go",
+        "foo_abi.go",
+    ],
+    deps = [
+        "<PKGPATH>/gvisor/pkg/abi",
+        "<PKGPATH>/gvisor/pkg/sentry/safemem/safemem",
+        "<PKGPATH>/gvisor/pkg/sentry/usermem/usermem",
+    ],
+)
+```
+
+As part of the interface generation, `go_marshal` also generates some tests for
+sanity checking the struct definitions for potential alignment issues, and a
+simple round-trip test through Marshal/Unmarshal to verify the implementation.
+These tests use reflection to verify properties of the ABI struct, and should be
+considered part of the generated interfaces (but are too expensive to execute at
+runtime). Ensure these tests run at some point.
+
+```
+$ cat BUILD
+load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
+
+go_library(
+    name = "foo",
+    srcs = ["foo.go"],
+)
+$ blaze build :foo
+$ blaze query ...
+<path-to-dir>:foo_abi_autogen
+<path-to-dir>:foo_abi_autogen_test
+$ blaze test :foo_abi_autogen_test
+<test-output>
+```
+
+# Restrictions
+
+Not all valid go type definitions can be used with `go_marshal`. `go_marshal` is
+intended for ABI structs, which have these additional restrictions:
+
+-   At the moment, `go_marshal` only supports struct declarations.
+
+-   Structs are marshalled as packed types. This means no implicit padding is
+    inserted between fields shorter than the platform register size. For
+    alignment, manually insert padding fields.
+
+-   Structs used with `go_marshal` must have a compile-time static size. This
+    means no dynamically sizes fields like slices or strings. Use statically
+    sized array (byte arrays for strings) instead.
+
+-   No pointers, channel, map or function pointer fields, and no fields that are
+    arrays of these types. These don't make sense in an ABI data structure.
+
+-   We could support opaque pointers as `uintptr`, but this is currently not
+    implemented. Implementing this would require handling the architecture
+    dependent native pointer size.
+
+-   Fields must either be a primitive integer type (`byte`,
+    `[u]int{8,16,32,64}`), or of a type that implements abi.Marshallable.
+
+-   `int` and `uint` fields are not allowed. Use an explicitly-sized numeric
+    type.
+
+-   `float*` fields are currently not supported, but could be if necessary.
+
+# Appendix
+
+## Working with Non-Packed Structs
+
+ABI structs must generally be packed types, meaning they should have no implicit
+padding between short fields. However, if a field is tagged
+`marshal:"unaligned"`, `go_marshal` will fall back to a safer but slower
+mechanism to deal with potentially unaligned fields.
+
+Note that the non-packed property is inheritted by any other struct that embeds
+this struct, since the `go_marshal` tool currently can't reason about alignments
+for embedded structs that are not aligned.
+
+Because of this, it's generally best to avoid using `marshal:"unaligned"` and
+insert explicit padding fields instead.
+
+## Debugging go_marshal
+
+To enable debugging output from the go marshal tool, pass the `-debug` flag to
+the tool. When using the build rules from above, add a `debug = True` field to
+the build rule like this:
+
+```
+load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
+
+go_library(
+    name = "foo",
+    srcs = ["foo.go"],
+    debug = True,
+)
+```
+
+## Modifying the `go_marshal` Tool
+
+The following are some guidelines for modifying the `go_marshal` tool:
+
+-   The `go_marshal` tool currently does a single pass over all types requesting
+    code generation, in arbitrary order. This means the generated code can't
+    directly obtain information about embedded marshallable types at
+    compile-time. One way to work around this restriction is to add a new
+    Marshallable interface method providing this piece of information, and
+    calling it from the generated code. Use this sparingly, as we want to rely
+    on compile-time information as much as possible for performance.
+
+-   No runtime reflection in the code generated for the marshallable interface.
+    The entire point of the tool is to avoid runtime reflection. The generated
+    tests may use reflection.
diff --git a/tools/go_marshal/analysis/BUILD b/tools/go_marshal/analysis/BUILD
new file mode 100644
index 000000000..c859ced77
--- /dev/null
+++ b/tools/go_marshal/analysis/BUILD
@@ -0,0 +1,13 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "analysis",
+    testonly = 1,
+    srcs = ["analysis_unsafe.go"],
+    importpath = "gvisor.dev/gvisor/tools/go_marshal/analysis",
+    visibility = [
+        "//:sandbox",
+    ],
+)
diff --git a/tools/go_marshal/analysis/analysis_unsafe.go b/tools/go_marshal/analysis/analysis_unsafe.go
new file mode 100644
index 000000000..9a9a4f298
--- /dev/null
+++ b/tools/go_marshal/analysis/analysis_unsafe.go
@@ -0,0 +1,175 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package analysis implements common functionality used by generated
+// go_marshal tests.
+package analysis
+
+// All functions in this package are unsafe and are not intended for general
+// consumption. They contain sharp edge cases and the caller is responsible for
+// ensuring none of them are hit. Callers must be carefully to pass in only sane
+// arguments. Failure to do so may cause panics at best and arbitrary memory
+// corruption at worst.
+//
+// Never use outside of tests.
+
+import (
+	"fmt"
+	"math/rand"
+	"reflect"
+	"testing"
+	"unsafe"
+)
+
+// RandomizeValue assigns random value(s) to an abitrary type. This is intended
+// for used with ABI structs from go_marshal, meaning the typical restrictions
+// apply (fixed-size types, no pointers, maps, channels, etc), and should only
+// be used on zeroed values to avoid overwriting pointers to active go objects.
+//
+// Internally, we populate the type with random data by doing an unsafe cast to
+// access the underlying memory of the type and filling it as if it were a byte
+// slice. This almost gets us what we want, but padding fields named "_" are
+// normally not accessible, so we walk the type and recursively zero all "_"
+// fields.
+//
+// Precondition: x must be a pointer. x must not contain any valid
+// pointers to active go objects (pointer fields aren't allowed in ABI
+// structs anyways), or we'd be violating the go runtime contract and
+// the GC may malfunction.
+func RandomizeValue(x interface{}) {
+	v := reflect.Indirect(reflect.ValueOf(x))
+	if !v.CanSet() {
+		panic("RandomizeType() called with an unaddressable value. You probably need to pass a pointer to the argument")
+	}
+
+	// Cast the underlying memory for the type into a byte slice.
+	var b []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b))
+	// Note: v.UnsafeAddr panics if x is passed by value. x should be a pointer.
+	hdr.Data = v.UnsafeAddr()
+	hdr.Len = int(v.Type().Size())
+	hdr.Cap = hdr.Len
+
+	// Fill the byte slice with random data, which in effect fills the type with
+	// random values.
+	n, err := rand.Read(b)
+	if err != nil || n != len(b) {
+		panic("unreachable")
+	}
+
+	// Normally, padding fields are not accessible, so zero them out.
+	reflectZeroPaddingFields(v.Type(), b, false)
+}
+
+// reflectZeroPaddingFields assigns zero values to padding fields for the value
+// of type r, represented by the memory in data. Padding fields are defined as
+// fields with the name "_". If zero is true, the immediate value itself is
+// zeroed. In addition, the type is recursively scanned for padding fields in
+// inner types.
+//
+// This is used for zeroing padding fields after calling RandomizeValue.
+func reflectZeroPaddingFields(r reflect.Type, data []byte, zero bool) {
+	if zero {
+		for i, _ := range data {
+			data[i] = 0
+		}
+	}
+	switch r.Kind() {
+	case reflect.Int8, reflect.Uint8, reflect.Int16, reflect.Uint16, reflect.Int32, reflect.Uint32, reflect.Int64, reflect.Uint64:
+		// These types are explicitly allowed in an ABI type, but we don't need
+		// to recurse further as they're scalar types.
+	case reflect.Struct:
+		for i, numFields := 0, r.NumField(); i < numFields; i++ {
+			f := r.Field(i)
+			off := f.Offset
+			len := f.Type.Size()
+			window := data[off : off+len]
+			reflectZeroPaddingFields(f.Type, window, f.Name == "_")
+		}
+	case reflect.Array:
+		eLen := int(r.Elem().Size())
+		if int(r.Size()) != eLen*r.Len() {
+			panic("Array has unexpected size?")
+		}
+		for i, n := 0, r.Len(); i < n; i++ {
+			reflectZeroPaddingFields(r.Elem(), data[i*eLen:(i+1)*eLen], false)
+		}
+	default:
+		panic(fmt.Sprintf("Type %v not allowed in ABI struct", r.Kind()))
+
+	}
+}
+
+// AlignmentCheck ensures the definition of the type represented by typ doesn't
+// cause the go compiler to emit implicit padding between elements of the type
+// (i.e. fields in a struct).
+//
+// AlignmentCheck doesn't explicitly recurse for embedded structs because any
+// struct present in an ABI struct must also be Marshallable, and therefore
+// they're aligned by definition (or their alignment check would have failed).
+func AlignmentCheck(t *testing.T, typ reflect.Type) (ok bool, delta uint64) {
+	switch typ.Kind() {
+	case reflect.Int8, reflect.Uint8, reflect.Int16, reflect.Uint16, reflect.Int32, reflect.Uint32, reflect.Int64, reflect.Uint64:
+		// Primitive types are always considered well aligned. Primitive types
+		// that are fields in structs are checked independently, this branch
+		// exists to handle recursive calls to alignmentCheck.
+	case reflect.Struct:
+		xOff := 0
+		nextXOff := 0
+		skipNext := false
+		for i, numFields := 0, typ.NumField(); i < numFields; i++ {
+			xOff = nextXOff
+			f := typ.Field(i)
+			fmt.Printf("Checking alignment of %s.%s @ %d [+%d]...\n", typ.Name(), f.Name, f.Offset, f.Type.Size())
+			nextXOff = int(f.Offset + f.Type.Size())
+
+			if f.Name == "_" {
+				// Padding fields need not be aligned.
+				fmt.Printf("Padding field of type %v\n", f.Type)
+				continue
+			}
+
+			if tag, ok := f.Tag.Lookup("marshal"); ok && tag == "unaligned" {
+				skipNext = true
+				continue
+			}
+
+			if skipNext {
+				skipNext = false
+				fmt.Printf("Skipping alignment check for field %s.%s explicitly marked as unaligned.\n", typ.Name(), f.Name)
+				continue
+			}
+
+			if xOff != int(f.Offset) {
+				implicitPad := int(f.Offset) - xOff
+				t.Fatalf("Suspect offset for field %s.%s, detected an implicit %d byte padding from offset %d to %d; either add %d bytes of explicit padding before this field or tag it as `marshal:\"unaligned\"`.", typ.Name(), f.Name, implicitPad, xOff, f.Offset, implicitPad)
+			}
+		}
+
+		// Ensure structs end on a byte explicitly defined by the type.
+		if typ.NumField() > 0 && nextXOff != int(typ.Size()) {
+			implicitPad := int(typ.Size()) - nextXOff
+			f := typ.Field(typ.NumField() - 1) // Final field
+			t.Fatalf("Suspect offset for field %s.%s at the end of %s, detected an implicit %d byte padding from offset %d to %d at the end of the struct; either add %d bytes of explict padding at end of the struct or tag the final field %s as `marshal:\"unaligned\"`.",
+				typ.Name(), f.Name, typ.Name(), implicitPad, nextXOff, typ.Size(), implicitPad, f.Name)
+		}
+	case reflect.Array:
+		// Independent arrays are also always considered well aligned. We only
+		// need to worry about their alignment when they're embedded in structs,
+		// which we handle above.
+	default:
+		t.Fatalf("Unsupported type in ABI struct while checking for field alignment for type: %v", typ.Kind())
+	}
+	return true, uint64(typ.Size())
+}
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
new file mode 100644
index 000000000..60a992b7f
--- /dev/null
+++ b/tools/go_marshal/defs.bzl
@@ -0,0 +1,158 @@
+"""Marshal is a tool for generating marshalling interfaces for Go types.
+
+The recommended way is to use the go_library rule defined below with mostly
+identical configuration as the native go_library rule.
+
+load("//tools/go_marshal:defs.bzl", "go_library")
+
+go_library(
+    name = "foo",
+    srcs = ["foo.go"],
+)
+
+Under the hood, the go_marshal rule is used to generate a file that will
+appear in a Go target; the output file should appear explicitly in a srcs list.
+For example (the above is still the preferred way):
+
+load("//tools/go_marshal:defs.bzl", "go_marshal")
+
+go_marshal(
+    name = "foo_abi",
+    srcs = ["foo.go"],
+    out = "foo_abi.go",
+    package = "foo",
+)
+
+go_library(
+    name = "foo",
+    srcs = [
+        "foo.go",
+        "foo_abi.go",
+    ],
+    deps = [
+       "//tools/go_marshal:marshal",
+       "//pkg/sentry/platform/safecopy",
+       "//pkg/sentry/usermem",
+    ],
+)
+"""
+
+load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test")
+
+def _go_marshal_impl(ctx):
+    """Execute the go_marshal tool."""
+    output = ctx.outputs.lib
+    output_test = ctx.outputs.test
+    (build_dir, _, _) = ctx.build_file_path.rpartition("/BUILD")
+
+    decl = "/".join(["gvisor.dev/gvisor", build_dir])
+
+    # Run the marshal command.
+    args = ["-output=%s" % output.path]
+    args += ["-pkg=%s" % ctx.attr.package]
+    args += ["-output_test=%s" % output_test.path]
+    args += ["-declarationPkg=%s" % decl]
+
+    if ctx.attr.debug:
+        args += ["-debug"]
+
+    args += ["--"]
+    for src in ctx.attr.srcs:
+        args += [f.path for f in src.files.to_list()]
+    ctx.actions.run(
+        inputs = ctx.files.srcs,
+        outputs = [output, output_test],
+        mnemonic = "GoMarshal",
+        progress_message = "go_marshal: %s" % ctx.label,
+        arguments = args,
+        executable = ctx.executable._tool,
+    )
+
+# Generates save and restore logic from a set of Go files.
+#
+# Args:
+#   name: the name of the rule.
+#   srcs: the input source files. These files should include all structs in the
+#         package that need to be saved.
+#   imports: an optional list of extra, non-aliased, Go-style absolute import
+#            paths.
+#   out: the name of the generated file output. This must not conflict with any
+#        other files and must be added to the srcs of the relevant go_library.
+#   package: the package name for the input sources.
+go_marshal = rule(
+    implementation = _go_marshal_impl,
+    attrs = {
+        "srcs": attr.label_list(mandatory = True, allow_files = True),
+        "libname": attr.string(mandatory = True),
+        "imports": attr.string_list(mandatory = False),
+        "package": attr.string(mandatory = True),
+        "debug": attr.bool(doc = "enable debugging output from the go_marshal tool"),
+        "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_marshal:go_marshal")),
+    },
+    outputs = {
+        "lib": "%{name}_unsafe.go",
+        "test": "%{name}_test.go",
+    },
+)
+
+def go_library(name, srcs, deps = [], imports = [], debug = False, **kwargs):
+    """wraps the standard go_library and does mashalling interface generation.
+
+    Args:
+      name: Same as native go_library.
+      srcs: Same as native go_library.
+      deps: Same as native go_library.
+      imports: Extra import paths to pass to the go_marshal tool.
+      debug: Enables debugging output from the go_marshal tool.
+      **kwargs: Remaining args to pass to the native go_library rule unmodified.
+    """
+    go_marshal(
+        name = name + "_abi_autogen",
+        libname = name,
+        srcs = [src for src in srcs if src.endswith(".go")],
+        debug = debug,
+        imports = imports,
+        package = name,
+    )
+
+    extra_deps = [
+        "//tools/go_marshal/marshal",
+        "//pkg/sentry/platform/safecopy",
+        "//pkg/sentry/usermem",
+    ]
+
+    all_srcs = srcs + [name + "_abi_autogen_unsafe.go"]
+    all_deps = deps + []  #  + extra_deps
+
+    for extra in extra_deps:
+        if extra not in deps:
+            all_deps.append(extra)
+
+    _go_library(
+        name = name,
+        srcs = all_srcs,
+        deps = all_deps,
+        **kwargs
+    )
+
+    # Don't pass importpath arg to go_test.
+    kwargs.pop("importpath", "")
+
+    _go_test(
+        name = name + "_abi_autogen_test",
+        srcs = [name + "_abi_autogen_test.go"],
+        # Generated test has a fixed set of dependencies since we generate these
+        # tests. They should only depend on the library generated above, and the
+        # Marshallable interface.
+        deps = [
+            ":" + name,
+            "//tools/go_marshal/analysis",
+        ],
+        **kwargs
+    )
+
+def go_test(**kwargs):
+    """Wraps the standard go_test."""
+    _go_test(
+        **kwargs
+    )
diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD
new file mode 100644
index 000000000..a0eae6492
--- /dev/null
+++ b/tools/go_marshal/gomarshal/BUILD
@@ -0,0 +1,17 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gomarshal",
+    srcs = [
+        "generator.go",
+        "generator_interfaces.go",
+        "generator_tests.go",
+        "util.go",
+    ],
+    importpath = "gvisor.dev/gvisor/tools/go_marshal/gomarshal",
+    visibility = [
+        "//:sandbox",
+    ],
+)
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
new file mode 100644
index 000000000..641ccd938
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -0,0 +1,382 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gomarshal implements the go_marshal code generator. See README.md.
+package gomarshal
+
+import (
+	"bytes"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"os"
+	"sort"
+)
+
+const (
+	marshalImport  = "gvisor.dev/gvisor/tools/go_marshal/marshal"
+	usermemImport  = "gvisor.dev/gvisor/pkg/sentry/usermem"
+	safecopyImport = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
+)
+
+// List of identifiers we use in generated code, that may conflict a
+// similarly-named source identifier. Avoid problems by refusing the generate
+// code when we see these.
+//
+// This only applies to import aliases at the moment. All other identifiers
+// are qualified by a receiver argument, since they're struct fields.
+//
+// All recievers are single letters, so we don't allow import aliases to be a
+// single letter.
+var badIdents = []string{
+	"src", "srcs", "dst", "dsts", "blk", "buf", "err",
+	// All single-letter identifiers.
+}
+
+// Generator drives code generation for a single invocation of the go_marshal
+// utility.
+//
+// The Generator holds arguments passed to the tool, and drives parsing,
+// processing and code Generator for all types marked with +marshal declared in
+// the input files.
+//
+// See Generator.run() as the entry point.
+type Generator struct {
+	// Paths to input go source files.
+	inputs []string
+	// Output file to write generated go source.
+	output *os.File
+	// Output file to write generated tests.
+	outputTest *os.File
+	// Package name for the generated file.
+	pkg string
+	// Go import path for package we're processing. This package should directly
+	// declare the type we're generating code for.
+	declaration string
+	// Set of extra packages to import in the generated file.
+	imports *importTable
+}
+
+// NewGenerator creates a new code Generator.
+func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports []string) (*Generator, error) {
+	f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't open output file %q: %v", out, err)
+	}
+	fTest, err := os.OpenFile(outTest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+	if err != nil {
+		return nil, fmt.Errorf("Couldn't open test output file %q: %v", out, err)
+	}
+	g := Generator{
+		inputs:      srcs,
+		output:      f,
+		outputTest:  fTest,
+		pkg:         pkg,
+		declaration: declaration,
+		imports:     newImportTable(),
+	}
+	for _, i := range imports {
+		// All imports on the extra imports list are unconditionally marked as
+		// used, so they're always added to the generated code.
+		g.imports.add(i).markUsed()
+	}
+	g.imports.add(marshalImport).markUsed()
+	// The follow imports may or may not be used by the generated
+	// code, depending what's required for the target types. Don't
+	// mark these imports as used by default.
+	g.imports.add(usermemImport)
+	g.imports.add(safecopyImport)
+	g.imports.add("unsafe")
+
+	return &g, nil
+}
+
+// writeHeader writes the header for the generated source file. The header
+// includes the package name, package level comments and import statements.
+func (g *Generator) writeHeader() error {
+	var b sourceBuffer
+	b.emit("// Automatically generated marshal implementation. See tools/go_marshal.\n\n")
+	b.emit("package %s\n\n", g.pkg)
+	if err := b.write(g.output); err != nil {
+		return err
+	}
+
+	return g.imports.write(g.output)
+}
+
+// writeTypeChecks writes a statement to force the compiler to perform a type
+// check for all Marshallable types referenced by the generated code.
+func (g *Generator) writeTypeChecks(ms map[string]struct{}) error {
+	if len(ms) == 0 {
+		return nil
+	}
+
+	msl := make([]string, 0, len(ms))
+	for m, _ := range ms {
+		msl = append(msl, m)
+	}
+	sort.Strings(msl)
+
+	var buf bytes.Buffer
+	fmt.Fprint(&buf, "// Marshallable types used by this file.\n")
+
+	for _, m := range msl {
+		fmt.Fprintf(&buf, "var _ marshal.Marshallable = (*%s)(nil)\n", m)
+	}
+	fmt.Fprint(&buf, "\n")
+
+	_, err := fmt.Fprint(g.output, buf.String())
+	return err
+}
+
+// parse processes all input files passed this generator and produces a set of
+// parsed go ASTs.
+func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) {
+	debugf("go_marshal invoked with %d input files:\n", len(g.inputs))
+	for _, path := range g.inputs {
+		debugf("  %s\n", path)
+	}
+
+	files := make([]*ast.File, 0, len(g.inputs))
+	fsets := make([]*token.FileSet, 0, len(g.inputs))
+
+	for _, path := range g.inputs {
+		fset := token.NewFileSet()
+		f, err := parser.ParseFile(fset, path, nil, parser.ParseComments)
+		if err != nil {
+			// Not a valid input file?
+			return nil, nil, fmt.Errorf("Input %q can't be parsed: %v", path, err)
+		}
+
+		if debugEnabled() {
+			debugf("AST for %q:\n", path)
+			ast.Print(fset, f)
+		}
+
+		files = append(files, f)
+		fsets = append(fsets, fset)
+	}
+
+	return files, fsets, nil
+}
+
+// collectMarshallabeTypes walks the parsed AST and collects a list of type
+// declarations for which we need to generate the Marshallable interface.
+func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*ast.TypeSpec {
+	var types []*ast.TypeSpec
+	for _, decl := range a.Decls {
+		gdecl, ok := decl.(*ast.GenDecl)
+		// Type declaration?
+		if !ok || gdecl.Tok != token.TYPE {
+			debugfAt(f.Position(decl.Pos()), "Skipping declaration since it's not a type declaration.\n")
+			continue
+		}
+		// Does it have a comment?
+		if gdecl.Doc == nil {
+			debugfAt(f.Position(gdecl.Pos()), "Skipping declaration since it doesn't have a comment.\n")
+			continue
+		}
+		// Does the comment contain a "+marshal" line?
+		marked := false
+		for _, c := range gdecl.Doc.List {
+			if c.Text == "// +marshal" {
+				marked = true
+				break
+			}
+		}
+		if !marked {
+			debugfAt(f.Position(gdecl.Pos()), "Skipping declaration since it doesn't have a comment containing +marshal line.\n")
+			continue
+		}
+		for _, spec := range gdecl.Specs {
+			// We already confirmed we're in a type declaration earlier.
+			t := spec.(*ast.TypeSpec)
+			if _, ok := t.Type.(*ast.StructType); ok {
+				debugfAt(f.Position(t.Pos()), "Collected marshallable type %s.\n", t.Name.Name)
+				types = append(types, t)
+				continue
+			}
+			debugf("Skipping declaration %v since it's not a struct declaration.\n", gdecl)
+		}
+	}
+	return types
+}
+
+// collectImports collects all imports from all input source files. Some of
+// these imports are copied to the generated output, if they're referenced by
+// the generated code.
+//
+// collectImports de-duplicates imports while building the list, and ensures
+// identifiers in the generated code don't conflict with any imported package
+// names.
+func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]importStmt {
+	badImportNames := make(map[string]bool)
+	for _, i := range badIdents {
+		badImportNames[i] = true
+	}
+
+	is := make(map[string]importStmt)
+	for _, decl := range a.Decls {
+		gdecl, ok := decl.(*ast.GenDecl)
+		// Import statement?
+		if !ok || gdecl.Tok != token.IMPORT {
+			continue
+		}
+		for _, spec := range gdecl.Specs {
+			i := g.imports.addFromSpec(spec.(*ast.ImportSpec), f)
+			debugf("Collected import '%s' as '%s'\n", i.path, i.name)
+
+			// Make sure we have an import that doesn't use any local names that
+			// would conflict with identifiers in the generated code.
+			if len(i.name) == 1 {
+				abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import has a single character local name '%s'; this may conflict with code generated by go_marshal, use a multi-character import alias", i.name))
+			}
+			if badImportNames[i.name] {
+				abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import name '%s' is likely to conflict with code generated by go_marshal, use a different import alias", i.name))
+			}
+		}
+	}
+	return is
+
+}
+
+func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
+	// We're guaranteed to have only struct type specs by now. See
+	// Generator.collectMarshallabeTypes.
+	i := newInterfaceGenerator(t, fset)
+	i.validate()
+	i.emitMarshallable()
+	return i
+}
+
+// generateOneTestSuite generates a test suite for the automatically generated
+// implementations type t.
+func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator {
+	i := newTestGenerator(t, g.declaration)
+	i.emitTests()
+	return i
+}
+
+// Run is the entry point to code generation using g.
+//
+// Run parses all input source files specified in g and emits generated code.
+func (g *Generator) Run() error {
+	// Parse our input source files into ASTs and token sets.
+	asts, fsets, err := g.parse()
+	if err != nil {
+		return err
+	}
+
+	if len(asts) != len(fsets) {
+		panic("ASTs and FileSets don't match")
+	}
+
+	// Map of imports in source files; key = local package name, value = import
+	// path.
+	is := make(map[string]importStmt)
+	for i, a := range asts {
+		// Collect all imports from the source files. We may need to copy some
+		// of these to the generated code if they're referenced. This has to be
+		// done before the loop below because we need to process all ASTs before
+		// we start requesting imports to be copied one by one as we encounter
+		// them in each generated source.
+		for name, i := range g.collectImports(a, fsets[i]) {
+			is[name] = i
+		}
+	}
+
+	var impls []*interfaceGenerator
+	var ts []*testGenerator
+	// Set of Marshallable types referenced by generated code.
+	ms := make(map[string]struct{})
+	for i, a := range asts {
+		// Collect type declarations marked for code generation and generate
+		// Marshallable interfaces.
+		for _, t := range g.collectMarshallabeTypes(a, fsets[i]) {
+			impl := g.generateOne(t, fsets[i])
+			// Collect Marshallable types referenced by the generated code.
+			for ref, _ := range impl.ms {
+				ms[ref] = struct{}{}
+			}
+			impls = append(impls, impl)
+			// Collect imports referenced by the generated code and add them to
+			// the list of imports we need to copy to the generated code.
+			for name, _ := range impl.is {
+				if !g.imports.markUsed(name) {
+					panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'", impl.typeName(), name))
+				}
+			}
+			ts = append(ts, g.generateOneTestSuite(t))
+		}
+	}
+
+	// Tool was invoked with input files with no data structures marked for code
+	// generation. This is probably not what the user intended.
+	if len(impls) == 0 {
+		var buf bytes.Buffer
+		fmt.Fprintf(&buf, "go_marshal invoked on these files, but they don't contain any types requiring code generation. Perhaps mark some with \"// +marshal\"?:\n")
+		for _, i := range g.inputs {
+			fmt.Fprintf(&buf, "  %s\n", i)
+		}
+		abort(buf.String())
+	}
+
+	// Write output file header. These include things like package name and
+	// import statements.
+	if err := g.writeHeader(); err != nil {
+		return err
+	}
+
+	// Write type checks for referenced marshallable types to output file.
+	if err := g.writeTypeChecks(ms); err != nil {
+		return err
+	}
+
+	// Write generated interfaces to output file.
+	for _, i := range impls {
+		if err := i.write(g.output); err != nil {
+			return err
+		}
+	}
+
+	// Write generated tests to test file.
+	return g.writeTests(ts)
+}
+
+// writeTests outputs tests for the generated interface implementations to a go
+// source file.
+func (g *Generator) writeTests(ts []*testGenerator) error {
+	var b sourceBuffer
+	b.emit("package %s_test\n\n", g.pkg)
+	if err := b.write(g.outputTest); err != nil {
+		return err
+	}
+
+	imports := newImportTable()
+	for _, t := range ts {
+		imports.merge(t.imports)
+	}
+
+	if err := imports.write(g.outputTest); err != nil {
+		return err
+	}
+
+	for _, t := range ts {
+		if err := t.write(g.outputTest); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
new file mode 100644
index 000000000..a712c14dc
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -0,0 +1,507 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+	"go/token"
+	"strings"
+)
+
+// interfaceGenerator generates marshalling interfaces for a single type.
+//
+// getState is not thread-safe.
+type interfaceGenerator struct {
+	sourceBuffer
+
+	// The type we're serializing.
+	t *ast.TypeSpec
+
+	// Receiver argument for generated methods.
+	r string
+
+	// FileSet containing the tokens for the type we're processing.
+	f *token.FileSet
+
+	// is records external packages referenced by the generated implementation.
+	is map[string]struct{}
+
+	// ms records Marshallable types referenced by the generated implementation
+	// of t's interfaces.
+	ms map[string]struct{}
+
+	// as records embedded fields in t that are potentially not packed. The key
+	// is the accessor for the field.
+	as map[string]struct{}
+}
+
+// typeName returns the name of the type this g represents.
+func (g *interfaceGenerator) typeName() string {
+	return g.t.Name.Name
+}
+
+// newinterfaceGenerator creates a new interface generator.
+func newInterfaceGenerator(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
+	if _, ok := t.Type.(*ast.StructType); !ok {
+		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
+	}
+	g := &interfaceGenerator{
+		t:  t,
+		r:  receiverName(t),
+		f:  fset,
+		is: make(map[string]struct{}),
+		ms: make(map[string]struct{}),
+		as: make(map[string]struct{}),
+	}
+	g.recordUsedMarshallable(g.typeName())
+	return g
+}
+
+func (g *interfaceGenerator) recordUsedMarshallable(m string) {
+	g.ms[m] = struct{}{}
+
+}
+
+func (g *interfaceGenerator) recordUsedImport(i string) {
+	g.is[i] = struct{}{}
+
+}
+
+func (g *interfaceGenerator) recordPotentiallyNonPackedField(fieldName string) {
+	g.as[fieldName] = struct{}{}
+}
+
+func (g *interfaceGenerator) forEachField(fn func(f *ast.Field)) {
+	// This is guaranteed to succeed because g.t is always a struct.
+	st := g.t.Type.(*ast.StructType)
+	for _, field := range st.Fields.List {
+		fn(field)
+	}
+}
+
+func (g *interfaceGenerator) fieldAccessor(n *ast.Ident) string {
+	return fmt.Sprintf("%s.%s", g.r, n.Name)
+}
+
+// abortAt aborts the go_marshal tool with the given error message, with a
+// reference position to the input source. Same as abortAt, but uses g to
+// resolve p to position.
+func (g *interfaceGenerator) abortAt(p token.Pos, msg string) {
+	abortAt(g.f.Position(p), msg)
+}
+
+// validate ensures the type we're working with can be marshalled. These checks
+// are done ahead of time and in one place so we can make assumptions later.
+func (g *interfaceGenerator) validate() {
+	g.forEachField(func(f *ast.Field) {
+		if len(f.Names) == 0 {
+			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
+		}
+	})
+
+	g.forEachField(func(f *ast.Field) {
+		fieldDispatcher{
+			primitive: func(_, t *ast.Ident) {
+				switch t.Name {
+				case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
+					// These are the only primitive types we're allow. Below, we
+					// provide suggestions for some disallowed types and reject
+					// them, then attempt to marshal any remaining types by
+					// invoking the marshal.Marshallable interface on them. If
+					// these types don't actually implement
+					// marshal.Marshallable, compilation of the generated code
+					// will fail with an appropriate error message.
+					return
+				case "int":
+					g.abortAt(f.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
+				case "uint":
+					g.abortAt(f.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
+				case "string":
+					g.abortAt(f.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
+				default:
+					debugfAt(g.f.Position(f.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
+				}
+			},
+			selector: func(_, _, _ *ast.Ident) {
+				// No validation to perform on selector fields. However this
+				// callback must still be provided.
+			},
+			array: func(n, _ *ast.Ident, len int) {
+				a := f.Type.(*ast.ArrayType)
+				if a.Len == nil {
+					g.abortAt(f.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name))
+				}
+
+				if _, ok := a.Len.(*ast.BasicLit); !ok {
+					g.abortAt(a.Len.Pos(), fmt.Sprintf("Array size must be a literal, don's use consts or expressions"))
+				}
+
+				if _, ok := a.Elt.(*ast.Ident); !ok {
+					g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt)))
+				}
+
+				if len <= 0 {
+					g.abortAt(a.Len.Pos(), fmt.Sprintf("Marshalling not supported for zero length arrays, why does an ABI struct have one?"))
+				}
+			},
+			unhandled: func(_ *ast.Ident) {
+				g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type)))
+			},
+		}.dispatch(f)
+	})
+}
+
+// scalarSize returns the size of type identified by t. If t isn't a primitive
+// type, the size isn't known at code generation time, and must be resolved via
+// the marshal.Marshallable interface.
+func (g *interfaceGenerator) scalarSize(t *ast.Ident) (size int, unknownSize bool) {
+	switch t.Name {
+	case "int8", "uint8", "byte":
+		return 1, false
+	case "int16", "uint16":
+		return 2, false
+	case "int32", "uint32":
+		return 4, false
+	case "int64", "uint64":
+		return 8, false
+	default:
+		return 0, true
+	}
+}
+
+func (g *interfaceGenerator) shift(bufVar string, n int) {
+	g.emit("%s = %s[%d:]\n", bufVar, bufVar, n)
+}
+
+func (g *interfaceGenerator) shiftDynamic(bufVar, name string) {
+	g.emit("%s = %s[%s.SizeBytes():]\n", bufVar, bufVar, name)
+}
+
+func (g *interfaceGenerator) marshalScalar(accessor, typ string, bufVar string) {
+	switch typ {
+	case "int8", "uint8", "byte":
+		g.emit("%s[0] = byte(%s)\n", bufVar, accessor)
+		g.shift(bufVar, 1)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint16(%s[:2], uint16(%s))\n", bufVar, accessor)
+		g.shift(bufVar, 2)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint32(%s[:4], uint32(%s))\n", bufVar, accessor)
+		g.shift(bufVar, 4)
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint64(%s[:8], uint64(%s))\n", bufVar, accessor)
+		g.shift(bufVar, 8)
+	default:
+		g.emit("%s.MarshalBytes(%s[:%s.SizeBytes()])\n", accessor, bufVar, accessor)
+		g.shiftDynamic(bufVar, accessor)
+	}
+}
+
+func (g *interfaceGenerator) unmarshalScalar(accessor, typ string, bufVar string) {
+	switch typ {
+	case "int8":
+		g.emit("%s = int8(%s[0])\n", accessor, bufVar)
+		g.shift(bufVar, 1)
+	case "uint8":
+		g.emit("%s = uint8(%s[0])\n", accessor, bufVar)
+		g.shift(bufVar, 1)
+	case "byte":
+		g.emit("%s = %s[0]\n", accessor, bufVar)
+		g.shift(bufVar, 1)
+
+	case "int16":
+		g.recordUsedImport("usermem")
+		g.emit("%s = int16(usermem.ByteOrder.Uint16(%s[:2]))\n", accessor, bufVar)
+		g.shift(bufVar, 2)
+	case "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("%s = usermem.ByteOrder.Uint16(%s[:2])\n", accessor, bufVar)
+		g.shift(bufVar, 2)
+
+	case "int32":
+		g.recordUsedImport("usermem")
+		g.emit("%s = int32(usermem.ByteOrder.Uint32(%s[:4]))\n", accessor, bufVar)
+		g.shift(bufVar, 4)
+	case "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("%s = usermem.ByteOrder.Uint32(%s[:4])\n", accessor, bufVar)
+		g.shift(bufVar, 4)
+
+	case "int64":
+		g.recordUsedImport("usermem")
+		g.emit("%s = int64(usermem.ByteOrder.Uint64(%s[:8]))\n", accessor, bufVar)
+		g.shift(bufVar, 8)
+	case "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("%s = usermem.ByteOrder.Uint64(%s[:8])\n", accessor, bufVar)
+		g.shift(bufVar, 8)
+	default:
+		g.emit("%s.UnmarshalBytes(%s[:%s.SizeBytes()])\n", accessor, bufVar, accessor)
+		g.shiftDynamic(bufVar, accessor)
+		g.recordPotentiallyNonPackedField(accessor)
+	}
+}
+
+// areFieldsPackedExpression returns a go expression checking whether g.t's fields are
+// packed. Returns "", false if g.t has no fields that may be potentially
+// packed, otherwise returns <clause>, true, where <clause> is an expression
+// like "t.a.Packed() && t.b.Packed() && t.c.Packed()".
+func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
+	if len(g.as) == 0 {
+		return "", false
+	}
+
+	cs := make([]string, 0, len(g.as))
+	for accessor, _ := range g.as {
+		cs = append(cs, fmt.Sprintf("%s.Packed()", accessor))
+	}
+	return strings.Join(cs, " && "), true
+}
+
+func (g *interfaceGenerator) emitMarshallable() {
+	// Is g.t a packed struct without consideing field types?
+	thisPacked := true
+	g.forEachField(func(f *ast.Field) {
+		if f.Tag != nil {
+			if f.Tag.Value == "`marshal:\"unaligned\"`" {
+				if thisPacked {
+					debugfAt(g.f.Position(g.t.Pos()),
+						fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name))
+					thisPacked = false
+				}
+			}
+		}
+	})
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		primitiveSize := 0
+		var dynamicSizeTerms []string
+
+		g.forEachField(fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if size, dynamic := g.scalarSize(t); !dynamic {
+					primitiveSize += size
+				} else {
+					g.recordUsedMarshallable(t.Name)
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%s.SizeBytes()", g.fieldAccessor(n)))
+				}
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name)
+				g.recordUsedImport(tX.Name)
+				g.recordUsedMarshallable(tName)
+				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
+			},
+			array: func(n, t *ast.Ident, len int) {
+				if len < 1 {
+					// Zero-length arrays should've been rejected by validate().
+					panic("unreachable")
+				}
+				if size, dynamic := g.scalarSize(t); !dynamic {
+					primitiveSize += size * len
+				} else {
+					g.recordUsedMarshallable(t.Name)
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%d", t.Name, len))
+				}
+			},
+		}.dispatch)
+		g.emit("return %d", primitiveSize)
+		if len(dynamicSizeTerms) > 0 {
+			g.incIndent()
+		}
+		{
+			for _, d := range dynamicSizeTerms {
+				g.emitNoIndent(" +\n")
+				g.emit(d)
+			}
+		}
+		if len(dynamicSizeTerms) > 0 {
+			g.decIndent()
+		}
+	})
+	g.emit("\n}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.forEachField(fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", t.Name, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("dst", len)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can referece here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name)
+					}
+					return
+				}
+				g.marshalScalar(g.fieldAccessor(n), t.Name, "dst")
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
+			},
+			array: func(n, t *ast.Ident, size int) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)*%d] ~= [%d]%s{0}\n", t.Name, size, size, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("dst", len*size)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can reference here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("dst = dst[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
+					}
+					return
+				}
+
+				g.emit("for i := 0; i < %d; i++ {\n", size)
+				g.inIndent(func() {
+					g.marshalScalar(fmt.Sprintf("%s[i]", g.fieldAccessor(n)), t.Name, "dst")
+				})
+				g.emit("}\n")
+			},
+		}.dispatch)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.forEachField(fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: var _ %s ~= src[:sizeof(%s)]\n", t.Name, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("src", len)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can reference here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("src = src[(*%s)(nil).SizeBytes():]\n", t.Name)
+						g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name))
+					}
+					return
+				}
+				g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src")
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
+			},
+			array: func(n, t *ast.Ident, size int) {
+				if n.Name == "_" {
+					g.emit("// Padding: ~ copy([%d]%s(%s), src[:sizeof(%s)*%d])\n", size, t.Name, g.fieldAccessor(n), t.Name, size)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("src", len*size)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can referece here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("src = src[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
+					}
+					return
+				}
+
+				g.emit("for i := 0; i < %d; i++ {\n", size)
+				g.inIndent(func() {
+					g.unmarshalScalar(fmt.Sprintf("%s[i]", g.fieldAccessor(n)), t.Name, "src")
+				})
+				g.emit("}\n")
+			},
+		}.dispatch)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		expr, fieldsMaybePacked := g.areFieldsPackedExpression()
+		switch {
+		case !thisPacked:
+			g.emit("return false\n")
+		case fieldsMaybePacked:
+			g.emit("return %s\n", expr)
+		default:
+			g.emit("return true\n")
+
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if thisPacked {
+			g.recordUsedImport("safecopy")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if %s {\n", cond)
+				g.inIndent(func() {
+					g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+				})
+				g.emit("} else {\n")
+				g.inIndent(func() {
+					g.emit("%s.MarshalBytes(dst)\n", g.r)
+				})
+				g.emit("}\n")
+			} else {
+				g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+			}
+		} else {
+			g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName())
+			g.emit("%s.MarshalBytes(dst)\n", g.r)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if thisPacked {
+			g.recordUsedImport("safecopy")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if %s {\n", cond)
+				g.inIndent(func() {
+					g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+				})
+				g.emit("} else {\n")
+				g.inIndent(func() {
+					g.emit("%s.UnmarshalBytes(src)\n", g.r)
+				})
+				g.emit("}\n")
+			} else {
+				g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+			}
+		} else {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("%s.UnmarshalBytes(src)\n", g.r)
+		}
+	})
+	g.emit("}\n\n")
+
+}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
new file mode 100644
index 000000000..df25cb5b2
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -0,0 +1,154 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+	"io"
+	"strings"
+)
+
+var standardImports = []string{
+	"fmt",
+	"reflect",
+	"testing",
+	"gvisor.dev/gvisor/tools/go_marshal/analysis",
+}
+
+type testGenerator struct {
+	sourceBuffer
+
+	// The type we're serializing.
+	t *ast.TypeSpec
+
+	// Receiver argument for generated methods.
+	r string
+
+	// Imports used by generated code.
+	imports *importTable
+
+	// Import statement for the package declaring the type we generated code
+	// for. We need this to construct test instances for the type, since the
+	// tests aren't written in the same package.
+	decl *importStmt
+}
+
+func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator {
+	if _, ok := t.Type.(*ast.StructType); !ok {
+		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
+	}
+	g := &testGenerator{
+		t:       t,
+		r:       receiverName(t),
+		imports: newImportTable(),
+	}
+
+	for _, i := range standardImports {
+		g.imports.add(i).markUsed()
+	}
+	g.decl = g.imports.add(declaration)
+	g.decl.markUsed()
+
+	return g
+}
+
+func (g *testGenerator) typeName() string {
+	return fmt.Sprintf("%s.%s", g.decl.name, g.t.Name.Name)
+}
+
+func (g *testGenerator) forEachField(fn func(f *ast.Field)) {
+	// This is guaranteed to succeed because g.t is always a struct.
+	st := g.t.Type.(*ast.StructType)
+	for _, field := range st.Fields.List {
+		fn(field)
+	}
+}
+
+func (g *testGenerator) testFuncName(base string) string {
+	return fmt.Sprintf("%s%s", base, strings.Title(g.t.Name.Name))
+}
+
+func (g *testGenerator) inTestFunction(name string, body func()) {
+	g.emit("func %s(t *testing.T) {\n", g.testFuncName(name))
+	g.inIndent(body)
+	g.emit("}\n\n")
+}
+
+func (g *testGenerator) emitTestNonZeroSize() {
+	g.inTestFunction("TestSizeNonZero", func() {
+		g.emit("x := &%s{}\n", g.typeName())
+		g.emit("if x.SizeBytes() == 0 {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(\"Marshallable.Size() should not return zero\")\n")
+		})
+		g.emit("}\n")
+	})
+}
+
+func (g *testGenerator) emitTestSuspectAlignment() {
+	g.inTestFunction("TestSuspectAlignment", func() {
+		g.emit("x := %s{}\n", g.typeName())
+		g.emit("analysis.AlignmentCheck(t, reflect.TypeOf(x))\n")
+	})
+}
+
+func (g *testGenerator) emitTestMarshalUnmarshalPreservesData() {
+	g.inTestFunction("TestSafeMarshalUnmarshalPreservesData", func() {
+		g.emit("var x, y, z, yUnsafe, zUnsafe %s\n", g.typeName())
+		g.emit("analysis.RandomizeValue(&x)\n\n")
+
+		g.emit("buf := make([]byte, x.SizeBytes())\n")
+		g.emit("x.MarshalBytes(buf)\n")
+		g.emit("bufUnsafe := make([]byte, x.SizeBytes())\n")
+		g.emit("x.MarshalUnsafe(bufUnsafe)\n\n")
+
+		g.emit("y.UnmarshalBytes(buf)\n")
+		g.emit("if !reflect.DeepEqual(x, y) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across Marshal/Unmarshal cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, y))\n")
+		})
+		g.emit("}\n")
+		g.emit("yUnsafe.UnmarshalBytes(bufUnsafe)\n")
+		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/Unmarshal cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, yUnsafe))\n")
+		})
+		g.emit("}\n\n")
+
+		g.emit("z.UnmarshalUnsafe(buf)\n")
+		g.emit("if !reflect.DeepEqual(x, z) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across Marshal/UnmarshalUnsafe cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, z))\n")
+		})
+		g.emit("}\n")
+		g.emit("zUnsafe.UnmarshalUnsafe(bufUnsafe)\n")
+		g.emit("if !reflect.DeepEqual(x, zUnsafe) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalUnsafe cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, zUnsafe))\n")
+		})
+		g.emit("}\n")
+	})
+}
+
+func (g *testGenerator) emitTests() {
+	g.emitTestNonZeroSize()
+	g.emitTestSuspectAlignment()
+	g.emitTestMarshalUnmarshalPreservesData()
+}
+
+func (g *testGenerator) write(out io.Writer) error {
+	return g.sourceBuffer.write(out)
+}
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
new file mode 100644
index 000000000..967537abf
--- /dev/null
+++ b/tools/go_marshal/gomarshal/util.go
@@ -0,0 +1,387 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gomarshal
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"go/ast"
+	"go/token"
+	"io"
+	"os"
+	"path"
+	"reflect"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+var debug = flag.Bool("debug", false, "enables debugging output")
+
+// receiverName returns an appropriate receiver name given a type spec.
+func receiverName(t *ast.TypeSpec) string {
+	if len(t.Name.Name) < 1 {
+		// Zero length type name?
+		panic("unreachable")
+	}
+	return strings.ToLower(t.Name.Name[:1])
+}
+
+// kindString returns a user-friendly representation of an AST expr type.
+func kindString(e ast.Expr) string {
+	switch e.(type) {
+	case *ast.Ident:
+		return "scalar"
+	case *ast.ArrayType:
+		return "array"
+	case *ast.StructType:
+		return "struct"
+	case *ast.StarExpr:
+		return "pointer"
+	case *ast.FuncType:
+		return "function"
+	case *ast.InterfaceType:
+		return "interface"
+	case *ast.MapType:
+		return "map"
+	case *ast.ChanType:
+		return "channel"
+	default:
+		return reflect.TypeOf(e).String()
+	}
+}
+
+// fieldDispatcher is a collection of callbacks for handling different types of
+// fields in a struct declaration.
+type fieldDispatcher struct {
+	primitive func(n, t *ast.Ident)
+	selector  func(n, tX, tSel *ast.Ident)
+	array     func(n, t *ast.Ident, size int)
+	unhandled func(n *ast.Ident)
+}
+
+// Precondition: All dispatch callbacks that will be invoked must be
+// provided. Embedded fields are not allowed, len(f.Names) >= 1.
+func (fd fieldDispatcher) dispatch(f *ast.Field) {
+	// Each field declaration may actually be multiple declarations of the same
+	// type. For example, consider:
+	//
+	// type Point struct {
+	//     x, y, z int
+	// }
+	//
+	// We invoke the call-backs once per such instance. Embedded fields are not
+	// allowed, and results in a panic.
+	if len(f.Names) < 1 {
+		panic("Precondition not met: attempted to dispatch on embedded field")
+	}
+
+	for _, name := range f.Names {
+		switch v := f.Type.(type) {
+		case *ast.Ident:
+			fd.primitive(name, v)
+		case *ast.SelectorExpr:
+			fd.selector(name, v.X.(*ast.Ident), v.Sel)
+		case *ast.ArrayType:
+			len := 0
+			if v.Len != nil {
+				// Non-literal array length is handled by generatorInterfaces.validate().
+				if lenLit, ok := v.Len.(*ast.BasicLit); ok {
+					var err error
+					len, err = strconv.Atoi(lenLit.Value)
+					if err != nil {
+						panic(err)
+					}
+				}
+			}
+			switch t := v.Elt.(type) {
+			case *ast.Ident:
+				fd.array(name, t, len)
+			default:
+				fd.array(name, nil, len)
+			}
+		default:
+			fd.unhandled(name)
+		}
+	}
+}
+
+// debugEnabled indicates whether debugging is enabled for gomarshal.
+func debugEnabled() bool {
+	return *debug
+}
+
+// abort aborts the go_marshal tool with the given error message.
+func abort(msg string) {
+	if !strings.HasSuffix(msg, "\n") {
+		msg += "\n"
+	}
+	fmt.Print(msg)
+	os.Exit(1)
+}
+
+// abortAt aborts the go_marshal tool with the given error message, with
+// a reference position to the input source.
+func abortAt(p token.Position, msg string) {
+	abort(fmt.Sprintf("%v:\n  %s\n", p, msg))
+}
+
+// debugf conditionally prints a debug message.
+func debugf(f string, a ...interface{}) {
+	if debugEnabled() {
+		fmt.Printf(f, a...)
+	}
+}
+
+// debugfAt conditionally prints a debug message with a reference to a position
+// in the input source.
+func debugfAt(p token.Position, f string, a ...interface{}) {
+	if debugEnabled() {
+		fmt.Printf("%s:\n  %s", p, fmt.Sprintf(f, a...))
+	}
+}
+
+// emit generates a line of code in the output file.
+//
+// emit is a wrapper around writing a formatted string to the output
+// buffer. emit can be invoked in one of two ways:
+//
+// (1) emit("some string")
+//     When emit is called with a single string argument, it is simply copied to
+//     the output buffer without any further formatting.
+// (2) emit(fmtString, args...)
+//     emit can also be invoked in a similar fashion to *Printf() functions,
+//     where the first argument is a format string.
+//
+// Calling emit with a single argument that is not a string will result in a
+// panic, as the caller's intent is ambiguous.
+func emit(out io.Writer, indent int, a ...interface{}) {
+	const spacesPerIndentLevel = 4
+
+	if len(a) < 1 {
+		panic("emit() called with no arguments")
+	}
+
+	if indent > 0 {
+		if _, err := fmt.Fprint(out, strings.Repeat(" ", indent*spacesPerIndentLevel)); err != nil {
+			// Writing to the emit output should not fail. Typically the output
+			// is a byte.Buffer; writes to these never fail.
+			panic(err)
+		}
+	}
+
+	first, ok := a[0].(string)
+	if !ok {
+		// First argument must be either the string to emit (case 1 from
+		// function-level comment), or a format string (case 2).
+		panic(fmt.Sprintf("First argument to emit() is not a string: %+v", a[0]))
+	}
+
+	if len(a) == 1 {
+		// Single string argument. Assume no formatting requested.
+		if _, err := fmt.Fprint(out, first); err != nil {
+			// Writing to out should not fail.
+			panic(err)
+		}
+		return
+
+	}
+
+	// Formatting requested.
+	if _, err := fmt.Fprintf(out, first, a[1:]...); err != nil {
+		// Writing to out should not fail.
+		panic(err)
+	}
+}
+
+// sourceBuffer represents fragments of generated go source code.
+//
+// sourceBuffer provides a convenient way to build up go souce fragments in
+// memory. May be safely zero-value initialized. Not thread-safe.
+type sourceBuffer struct {
+	// Current indentation level.
+	indent int
+
+	// Memory buffer containing contents while they're being generated.
+	b bytes.Buffer
+}
+
+func (b *sourceBuffer) incIndent() {
+	b.indent++
+}
+
+func (b *sourceBuffer) decIndent() {
+	if b.indent <= 0 {
+		panic("decIndent() without matching incIndent()")
+	}
+	b.indent--
+}
+
+func (b *sourceBuffer) emit(a ...interface{}) {
+	emit(&b.b, b.indent, a...)
+}
+
+func (b *sourceBuffer) emitNoIndent(a ...interface{}) {
+	emit(&b.b, 0 /*indent*/, a...)
+}
+
+func (b *sourceBuffer) inIndent(body func()) {
+	b.incIndent()
+	body()
+	b.decIndent()
+}
+
+func (b *sourceBuffer) write(out io.Writer) error {
+	_, err := fmt.Fprint(out, b.b.String())
+	return err
+}
+
+// Write implements io.Writer.Write.
+func (b *sourceBuffer) Write(buf []byte) (int, error) {
+	return (b.b.Write(buf))
+}
+
+// importStmt represents a single import statement.
+type importStmt struct {
+	// Local name of the imported package.
+	name string
+	// Import path.
+	path string
+	// Indicates whether the local name is an alias, or simply the final
+	// component of the path.
+	aliased bool
+	// Indicates whether this import was referenced by generated code.
+	used bool
+}
+
+func newImport(p string) *importStmt {
+	name := path.Base(p)
+	return &importStmt{
+		name:    name,
+		path:    p,
+		aliased: false,
+	}
+}
+
+func newImportFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
+	p := spec.Path.Value[1 : len(spec.Path.Value)-1] // Strip the " quotes around path.
+	name := path.Base(p)
+	if name == "" || name == "/" || name == "." {
+		panic(fmt.Sprintf("Couldn't process local package name for import at %s, (processed as %s)",
+			f.Position(spec.Path.Pos()), name))
+	}
+	if spec.Name != nil {
+		name = spec.Name.Name
+	}
+	return &importStmt{
+		name:    name,
+		path:    p,
+		aliased: spec.Name != nil,
+	}
+}
+
+func (i *importStmt) String() string {
+	if i.aliased {
+		return fmt.Sprintf("%s \"%s\"", i.name, i.path)
+	}
+	return fmt.Sprintf("\"%s\"", i.path)
+}
+
+func (i *importStmt) markUsed() {
+	i.used = true
+}
+
+func (i *importStmt) equivalent(other *importStmt) bool {
+	return i == other
+}
+
+// importTable represents a collection of importStmts.
+type importTable struct {
+	// Map of imports and whether they should be copied to the output.
+	is map[string]*importStmt
+}
+
+func newImportTable() *importTable {
+	return &importTable{
+		is: make(map[string]*importStmt),
+	}
+}
+
+// Merges import statements from other into i. Collisions in import statements
+// result in a panic.
+func (i *importTable) merge(other *importTable) {
+	for name, im := range other.is {
+		if dup, ok := i.is[name]; ok && dup.equivalent(im) {
+			panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
+		}
+
+		i.is[name] = im
+	}
+}
+
+func (i *importTable) add(s string) *importStmt {
+	n := newImport(s)
+	i.is[n.name] = n
+	return n
+}
+
+func (i *importTable) addFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
+	n := newImportFromSpec(spec, f)
+	i.is[n.name] = n
+	return n
+}
+
+// Marks the import named n as used. If no such import is in the table, returns
+// false.
+func (i *importTable) markUsed(n string) bool {
+	if n, ok := i.is[n]; ok {
+		n.markUsed()
+		return true
+	}
+	return false
+}
+
+func (i *importTable) clear() {
+	for _, i := range i.is {
+		i.used = false
+	}
+}
+
+func (i *importTable) write(out io.Writer) error {
+	if len(i.is) == 0 {
+		// Nothing to import, we're done.
+		return nil
+	}
+
+	imports := make([]string, 0, len(i.is))
+	for _, i := range i.is {
+		if i.used {
+			imports = append(imports, i.String())
+		}
+	}
+	sort.Strings(imports)
+
+	var b sourceBuffer
+	b.emit("import (\n")
+	b.incIndent()
+	for _, i := range imports {
+		b.emit("%s\n", i)
+	}
+	b.decIndent()
+	b.emit(")\n\n")
+
+	return b.write(out)
+}
diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go
new file mode 100644
index 000000000..3d12eb93c
--- /dev/null
+++ b/tools/go_marshal/main.go
@@ -0,0 +1,73 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// go_marshal is a code generation utility for automatically generating code to
+// marshal go data structures to memory.
+//
+// This binary is typically run as part of the build process, and is invoked by
+// the go_marshal bazel rule defined in defs.bzl.
+//
+// See README.md.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"strings"
+
+	"gvisor.dev/gvisor/tools/go_marshal/gomarshal"
+)
+
+var (
+	pkg            = flag.String("pkg", "", "output package")
+	output         = flag.String("output", "", "output file")
+	outputTest     = flag.String("output_test", "", "output file for tests")
+	imports        = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
+	declarationPkg = flag.String("declarationPkg", "", "import path of target declaring the types we're generating on")
+)
+
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: %s <input go src files>\n", os.Args[0])
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	if len(flag.Args()) == 0 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	if *pkg == "" {
+		flag.Usage()
+		fmt.Fprint(os.Stderr, "Flag -pkg must be provided.\n")
+		os.Exit(1)
+	}
+
+	var extraImports []string
+	if len(*imports) > 0 {
+		// Note: strings.Split(s, sep) returns s if sep doesn't exist in s. Thus
+		// we check for an empty imports list to avoid emitting an empty string
+		// as an import.
+		extraImports = strings.Split(*imports, ",")
+	}
+	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, *declarationPkg, extraImports)
+	if err != nil {
+		panic(err)
+	}
+
+	if err := g.Run(); err != nil {
+		panic(err)
+	}
+}
diff --git a/tools/go_marshal/marshal/BUILD b/tools/go_marshal/marshal/BUILD
new file mode 100644
index 000000000..47dda97a1
--- /dev/null
+++ b/tools/go_marshal/marshal/BUILD
@@ -0,0 +1,14 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "marshal",
+    srcs = [
+        "marshal.go",
+    ],
+    importpath = "gvisor.dev/gvisor/tools/go_marshal/marshal",
+    visibility = [
+        "//:sandbox",
+    ],
+)
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
new file mode 100644
index 000000000..a313a27ed
--- /dev/null
+++ b/tools/go_marshal/marshal/marshal.go
@@ -0,0 +1,60 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package marshal defines the Marshallable interface for
+// serialize/deserializing go data structures to/from memory, according to the
+// Linux ABI.
+//
+// Implementations of this interface are typically automatically generated by
+// tools/go_marshal. See the go_marshal README for details.
+package marshal
+
+// Marshallable represents a type that can be marshalled to and from memory.
+type Marshallable interface {
+	// SizeBytes is the size of the memory representation of a type in
+	// marshalled form.
+	SizeBytes() int
+
+	// MarshalBytes serializes a copy of a type to dst. dst must be at least
+	// SizeBytes() long.
+	MarshalBytes(dst []byte)
+
+	// UnmarshalBytes deserializes a type from src. src must be at least
+	// SizeBytes() long.
+	UnmarshalBytes(src []byte)
+
+	// Packed returns true if the marshalled size of the type is the same as the
+	// size it occupies in memory. This happens when the type has no fields
+	// starting at unaligned addresses (should always be true by default for ABI
+	// structs, verified by automatically generated tests when using
+	// go_marshal), and has no fields marked `marshal:"unaligned"`.
+	Packed() bool
+
+	// MarshalUnsafe serializes a type by bulk copying its in-memory
+	// representation to the dst buffer. This is only safe to do when the type
+	// has no implicit padding, see Marshallable.Packed. When Packed would
+	// return false, MarshalUnsafe should fall back to the safer but slower
+	// MarshalBytes.
+	MarshalUnsafe(dst []byte)
+
+	// UnmarshalUnsafe deserializes a type directly to the underlying memory
+	// allocated for the object by the runtime.
+	//
+	// This allows much faster unmarshalling of types which have no implicit
+	// padding, see Marshallable.Packed. When Packed would return false,
+	// UnmarshalUnsafe should fall back to the safer but slower unmarshal
+	// mechanism implemented in UnmarshalBytes (usually by calling
+	// UnmarshalBytes directly).
+	UnmarshalUnsafe(src []byte)
+}
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
new file mode 100644
index 000000000..947011414
--- /dev/null
+++ b/tools/go_marshal/test/BUILD
@@ -0,0 +1,29 @@
+package(licenses = ["notice"])
+
+load("//tools/go_marshal:defs.bzl", "go_library", "go_test")
+
+package_group(
+    name = "gomarshal_test",
+    packages = [
+        "//tools/go_marshal/test/...",
+    ],
+)
+
+go_test(
+    name = "benchmark_test",
+    srcs = ["benchmark_test.go"],
+    deps = [
+        ":test",
+        "//pkg/binary",
+        "//pkg/sentry/usermem",
+        "//tools/go_marshal/analysis",
+    ],
+)
+
+go_library(
+    name = "test",
+    testonly = 1,
+    srcs = ["test.go"],
+    importpath = "gvisor.dev/gvisor/tools/go_marshal/test",
+    deps = ["//tools/go_marshal/test/external"],
+)
diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go
new file mode 100644
index 000000000..e70db06d8
--- /dev/null
+++ b/tools/go_marshal/test/benchmark_test.go
@@ -0,0 +1,178 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+	"bytes"
+	encbin "encoding/binary"
+	"fmt"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/analysis"
+	test "gvisor.dev/gvisor/tools/go_marshal/test"
+)
+
+// Marshalling using the standard encoding/binary package.
+func BenchmarkEncodingBinary(b *testing.B) {
+	var s1, s2 test.Stat
+	analysis.RandomizeValue(&s1)
+
+	size := encbin.Size(&s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := bytes.NewBuffer(make([]byte, size))
+		buf.Reset()
+		if err := encbin.Write(buf, usermem.ByteOrder, &s1); err != nil {
+			b.Error("Write:", err)
+		}
+		if err := encbin.Read(buf, usermem.ByteOrder, &s2); err != nil {
+			b.Error("Read:", err)
+		}
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
+
+// Marshalling using the sentry's binary.Marshal.
+func BenchmarkBinary(b *testing.B) {
+	var s1, s2 test.Stat
+	analysis.RandomizeValue(&s1)
+
+	size := binary.Size(s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, 0, size)
+		buf = binary.Marshal(buf, usermem.ByteOrder, &s1)
+		binary.Unmarshal(buf, usermem.ByteOrder, &s2)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
+
+// Marshalling field-by-field with manually-written code.
+func BenchmarkMarshalManual(b *testing.B) {
+	var s1, s2 test.Stat
+	analysis.RandomizeValue(&s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, 0, s1.SizeBytes())
+
+		// Marshal
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, s1.Dev)
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, s1.Ino)
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, s1.Nlink)
+		buf = binary.AppendUint32(buf, usermem.ByteOrder, s1.Mode)
+		buf = binary.AppendUint32(buf, usermem.ByteOrder, s1.UID)
+		buf = binary.AppendUint32(buf, usermem.ByteOrder, s1.GID)
+		buf = binary.AppendUint32(buf, usermem.ByteOrder, 0)
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, s1.Rdev)
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.Size))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.Blksize))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.Blocks))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.ATime.Sec))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.ATime.Nsec))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.MTime.Sec))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.MTime.Nsec))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.CTime.Sec))
+		buf = binary.AppendUint64(buf, usermem.ByteOrder, uint64(s1.CTime.Nsec))
+
+		// Unmarshal
+		s2.Dev = usermem.ByteOrder.Uint64(buf[0:8])
+		s2.Ino = usermem.ByteOrder.Uint64(buf[8:16])
+		s2.Nlink = usermem.ByteOrder.Uint64(buf[16:24])
+		s2.Mode = usermem.ByteOrder.Uint32(buf[24:28])
+		s2.UID = usermem.ByteOrder.Uint32(buf[28:32])
+		s2.GID = usermem.ByteOrder.Uint32(buf[32:36])
+		// Padding: buf[36:40]
+		s2.Rdev = usermem.ByteOrder.Uint64(buf[40:48])
+		s2.Size = int64(usermem.ByteOrder.Uint64(buf[48:56]))
+		s2.Blksize = int64(usermem.ByteOrder.Uint64(buf[56:64]))
+		s2.Blocks = int64(usermem.ByteOrder.Uint64(buf[64:72]))
+		s2.ATime.Sec = int64(usermem.ByteOrder.Uint64(buf[72:80]))
+		s2.ATime.Nsec = int64(usermem.ByteOrder.Uint64(buf[80:88]))
+		s2.MTime.Sec = int64(usermem.ByteOrder.Uint64(buf[88:96]))
+		s2.MTime.Nsec = int64(usermem.ByteOrder.Uint64(buf[96:104]))
+		s2.CTime.Sec = int64(usermem.ByteOrder.Uint64(buf[104:112]))
+		s2.CTime.Nsec = int64(usermem.ByteOrder.Uint64(buf[112:120]))
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
+
+// Marshalling with the go_marshal safe API.
+func BenchmarkGoMarshalSafe(b *testing.B) {
+	var s1, s2 test.Stat
+	analysis.RandomizeValue(&s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, s1.SizeBytes())
+		s1.MarshalBytes(buf)
+		s2.UnmarshalBytes(buf)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
+
+// Marshalling with the go_marshal unsafe API.
+func BenchmarkGoMarshalUnsafe(b *testing.B) {
+	var s1, s2 test.Stat
+	analysis.RandomizeValue(&s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, s1.SizeBytes())
+		s1.MarshalUnsafe(buf)
+		s2.UnmarshalUnsafe(buf)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD
new file mode 100644
index 000000000..8fb43179b
--- /dev/null
+++ b/tools/go_marshal/test/external/BUILD
@@ -0,0 +1,11 @@
+package(licenses = ["notice"])
+
+load("//tools/go_marshal:defs.bzl", "go_library")
+
+go_library(
+    name = "external",
+    testonly = 1,
+    srcs = ["external.go"],
+    importpath = "gvisor.dev/gvisor/tools/go_marshal/test/external",
+    visibility = ["//tools/go_marshal/test:gomarshal_test"],
+)
diff --git a/tools/go_marshal/test/external/external.go b/tools/go_marshal/test/external/external.go
new file mode 100644
index 000000000..4be3722f3
--- /dev/null
+++ b/tools/go_marshal/test/external/external.go
@@ -0,0 +1,23 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package external defines types we can import for testing.
+package external
+
+// External is a public Marshallable type for use in testing.
+//
+// +marshal
+type External struct {
+	j int64
+}
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
new file mode 100644
index 000000000..8de02d707
--- /dev/null
+++ b/tools/go_marshal/test/test.go
@@ -0,0 +1,105 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package test contains data structures for testing the go_marshal tool.
+package test
+
+import (
+	// We're intentionally using a package name alias here even though it's not
+	// necessary to test the code generator's ability to handle package aliases.
+	ex "gvisor.dev/gvisor/tools/go_marshal/test/external"
+)
+
+// Type1 is a test data type.
+//
+// +marshal
+type Type1 struct {
+	a    Type2
+	x, y int64 // Multiple field names.
+	b    byte  `marshal:"unaligned"` // Short field.
+	c    uint64
+	_    uint32  // Unnamed scalar field.
+	_    [6]byte // Unnamed vector field, typical padding.
+	_    [2]byte
+	xs   [8]int32
+	as   [10]Type2 `marshal:"unaligned"` // Array of Marshallable objects.
+	ss   Type3
+}
+
+// Type2 is a test data type.
+//
+// +marshal
+type Type2 struct {
+	n int64
+	c byte
+	_ [7]byte
+	m int64
+	a int64
+}
+
+// Type3 is a test data type.
+//
+// +marshal
+type Type3 struct {
+	s int64
+	x ex.External // Type defined in another package.
+}
+
+// Type4 is a test data type.
+//
+// +marshal
+type Type4 struct {
+	c byte
+	x int64 `marshal:"unaligned"`
+	d byte
+	_ [7]byte
+}
+
+// Type5 is a test data type.
+//
+// +marshal
+type Type5 struct {
+	n int64
+	t Type4
+	m int64
+}
+
+// Timespec represents struct timespec in <time.h>.
+//
+// +marshal
+type Timespec struct {
+	Sec  int64
+	Nsec int64
+}
+
+// Stat represents struct stat.
+//
+// +marshal
+type Stat struct {
+	Dev     uint64
+	Ino     uint64
+	Nlink   uint64
+	Mode    uint32
+	UID     uint32
+	GID     uint32
+	_       int32
+	Rdev    uint64
+	Size    int64
+	Blksize int64
+	Blocks  int64
+	ATime   Timespec
+	MTime   Timespec
+	CTime   Timespec
+	_       [3]int64
+}
-- 
cgit v1.2.3


From df5d377521e625aeb8f4fe18bd1d9974dbf9998c Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 12 Sep 2019 15:09:01 -0700
Subject: Remove go_test from go_stateify and go_marshal

They are no-ops, so the standard rule works fine.

PiperOrigin-RevId: 268776264
---
 pkg/abi/linux/BUILD                        |  4 +-
 pkg/amutex/BUILD                           |  3 +-
 pkg/atomicbitops/BUILD                     |  3 +-
 pkg/binary/BUILD                           |  3 +-
 pkg/bits/BUILD                             |  3 +-
 pkg/bpf/BUILD                              |  4 +-
 pkg/compressio/BUILD                       |  3 +-
 pkg/cpuid/BUILD                            |  4 +-
 pkg/eventchannel/BUILD                     |  3 +-
 pkg/fd/BUILD                               |  3 +-
 pkg/fdchannel/BUILD                        |  3 +-
 pkg/flipcall/BUILD                         |  3 +-
 pkg/fspath/BUILD                           |  3 +-
 pkg/gate/BUILD                             |  3 +-
 pkg/ilist/BUILD                            |  3 +-
 pkg/linewriter/BUILD                       |  3 +-
 pkg/log/BUILD                              |  3 +-
 pkg/metric/BUILD                           |  3 +-
 pkg/p9/BUILD                               |  3 +-
 pkg/p9/p9test/BUILD                        |  4 +-
 pkg/procid/BUILD                           |  3 +-
 pkg/refs/BUILD                             |  4 +-
 pkg/seccomp/BUILD                          |  4 +-
 pkg/secio/BUILD                            |  3 +-
 pkg/segment/test/BUILD                     |  3 +-
 pkg/sentry/control/BUILD                   |  3 +-
 pkg/sentry/device/BUILD                    |  4 +-
 pkg/sentry/fs/BUILD                        |  4 +-
 pkg/sentry/fs/fdpipe/BUILD                 |  4 +-
 pkg/sentry/fs/fsutil/BUILD                 |  4 +-
 pkg/sentry/fs/gofer/BUILD                  |  4 +-
 pkg/sentry/fs/host/BUILD                   |  4 +-
 pkg/sentry/fs/lock/BUILD                   |  4 +-
 pkg/sentry/fs/proc/BUILD                   |  4 +-
 pkg/sentry/fs/proc/seqfile/BUILD           |  4 +-
 pkg/sentry/fs/ramfs/BUILD                  |  4 +-
 pkg/sentry/fs/tmpfs/BUILD                  |  4 +-
 pkg/sentry/fs/tty/BUILD                    |  4 +-
 pkg/sentry/fsimpl/ext/BUILD                |  4 +-
 pkg/sentry/fsimpl/ext/benchmark/BUILD      |  2 +-
 pkg/sentry/fsimpl/ext/disklayout/BUILD     |  4 +-
 pkg/sentry/fsimpl/memfs/BUILD              |  3 +-
 pkg/sentry/fsimpl/proc/BUILD               |  3 +-
 pkg/sentry/hostcpu/BUILD                   |  3 +-
 pkg/sentry/kernel/BUILD                    |  3 +-
 pkg/sentry/kernel/epoll/BUILD              |  4 +-
 pkg/sentry/kernel/eventfd/BUILD            |  4 +-
 pkg/sentry/kernel/futex/BUILD              |  4 +-
 pkg/sentry/kernel/pipe/BUILD               |  4 +-
 pkg/sentry/kernel/sched/BUILD              |  3 +-
 pkg/sentry/kernel/semaphore/BUILD          |  4 +-
 pkg/sentry/limits/BUILD                    |  4 +-
 pkg/sentry/memmap/BUILD                    |  4 +-
 pkg/sentry/mm/BUILD                        |  4 +-
 pkg/sentry/pgalloc/BUILD                   |  4 +-
 pkg/sentry/platform/interrupt/BUILD        |  3 +-
 pkg/sentry/platform/kvm/BUILD              |  3 +-
 pkg/sentry/platform/ring0/pagetables/BUILD |  3 +-
 pkg/sentry/platform/safecopy/BUILD         |  3 +-
 pkg/sentry/safemem/BUILD                   |  3 +-
 pkg/sentry/socket/netlink/port/BUILD       |  4 +-
 pkg/sentry/time/BUILD                      |  3 +-
 pkg/sentry/usermem/BUILD                   |  4 +-
 pkg/sentry/vfs/BUILD                       |  3 +-
 pkg/sleep/BUILD                            |  3 +-
 pkg/state/BUILD                            |  3 +-
 pkg/state/statefile/BUILD                  |  3 +-
 pkg/syserror/BUILD                         |  3 +-
 pkg/tcpip/BUILD                            |  4 +-
 pkg/tcpip/adapters/gonet/BUILD             |  3 +-
 pkg/tcpip/buffer/BUILD                     |  4 +-
 pkg/tcpip/hash/jenkins/BUILD               |  3 +-
 pkg/tcpip/header/BUILD                     |  4 +-
 pkg/tcpip/link/fdbased/BUILD               |  3 +-
 pkg/tcpip/link/muxed/BUILD                 |  3 +-
 pkg/tcpip/link/sharedmem/BUILD             |  3 +-
 pkg/tcpip/link/sharedmem/pipe/BUILD        |  3 +-
 pkg/tcpip/link/sharedmem/queue/BUILD       |  3 +-
 pkg/tcpip/link/waitable/BUILD              |  3 +-
 pkg/tcpip/network/BUILD                    |  2 +-
 pkg/tcpip/network/arp/BUILD                |  3 +-
 pkg/tcpip/network/fragmentation/BUILD      |  4 +-
 pkg/tcpip/network/ipv4/BUILD               |  3 +-
 pkg/tcpip/network/ipv6/BUILD               |  3 +-
 pkg/tcpip/ports/BUILD                      |  3 +-
 pkg/tcpip/stack/BUILD                      |  4 +-
 pkg/tcpip/transport/tcp/BUILD              |  4 +-
 pkg/tcpip/transport/tcpconntrack/BUILD     |  3 +-
 pkg/tcpip/transport/udp/BUILD              |  4 +-
 pkg/tmutex/BUILD                           |  3 +-
 pkg/unet/BUILD                             |  3 +-
 pkg/urpc/BUILD                             |  3 +-
 pkg/waiter/BUILD                           |  4 +-
 tools/go_marshal/defs.bzl                  |  6 ---
 tools/go_marshal/test/BUILD                |  4 +-
 tools/go_stateify/defs.bzl                 | 65 ++++++++++++++++++++----------
 96 files changed, 268 insertions(+), 123 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index ba233b93f..39c92bb33 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -2,9 +2,11 @@
 # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix
 # when the host OS may not be Linux.
 
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "linux",
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index 39d253b98..6bc486b62 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 47ab65346..5f59866fa 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD
index 09d6c2c1f..543fb54bf 100644
--- a/pkg/binary/BUILD
+++ b/pkg/binary/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD
index 0c2dde4f8..51967b811 100644
--- a/pkg/bits/BUILD
+++ b/pkg/bits/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD
index b692aa3b1..8d31e068c 100644
--- a/pkg/bpf/BUILD
+++ b/pkg/bpf/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "bpf",
diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD
index cdec96df1..a0b21d4bd 100644
--- a/pkg/compressio/BUILD
+++ b/pkg/compressio/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD
index 830e19e07..32422f9e2 100644
--- a/pkg/cpuid/BUILD
+++ b/pkg/cpuid/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "cpuid",
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 9961baaa9..71f2abc83 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -1,5 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD
index 785c685a0..afa8f7659 100644
--- a/pkg/fd/BUILD
+++ b/pkg/fd/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD
index e54e7371c..56495cbd9 100644
--- a/pkg/fdchannel/BUILD
+++ b/pkg/fdchannel/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index c1e078c7c..5643d5f26 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index 11716af81..0c5f50397 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(
     default_visibility = ["//visibility:public"],
diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD
index e6a8dbd02..4b9321711 100644
--- a/pkg/gate/BUILD
+++ b/pkg/gate/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD
index 8f3defa25..34d2673ef 100644
--- a/pkg/ilist/BUILD
+++ b/pkg/ilist/BUILD
@@ -1,5 +1,6 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index c8e923a74..a5d980d14 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index 12615240c..fc5f5779b 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index 3b8a691f4..842788179 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -1,5 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index c6737bf97..6bc4d3bc7 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(
     default_visibility = ["//visibility:public"],
diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD
index 6e939a49a..1d34181e0 100644
--- a/pkg/p9/p9test/BUILD
+++ b/pkg/p9/p9test/BUILD
@@ -1,5 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD
index 697e7a2f4..078f084b2 100644
--- a/pkg/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD
index 9c08452fc..827385139 100644
--- a/pkg/refs/BUILD
+++ b/pkg/refs/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "weak_ref_list",
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index d1024e49d..af94e944d 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -1,5 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD
index f38fb39f3..22abdc69f 100644
--- a/pkg/secio/BUILD
+++ b/pkg/secio/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD
index 694486296..12d7c77d2 100644
--- a/pkg/segment/test/BUILD
+++ b/pkg/segment/test/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(
     default_visibility = ["//visibility:private"],
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index bf802d1b6..5522cecd0 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD
index 7e8918722..0c86197f7 100644
--- a/pkg/sentry/device/BUILD
+++ b/pkg/sentry/device/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "device",
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index d7259b47b..3119a61b6 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "fs",
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index bf00b9c09..b9bd9ed17 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "fdpipe",
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 6499f87ac..b4ac83dc4 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "dirty_set_impl",
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 6b993928c..2b71ca0e1 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "gofer",
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index b1080fb1a..3e532332e 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "host",
diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD
index 08d7c0c57..5a7a5b8cd 100644
--- a/pkg/sentry/fs/lock/BUILD
+++ b/pkg/sentry/fs/lock/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "lock_range",
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index c7599d1f6..1c93e8886 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "proc",
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index 20c3eefc8..76433c7d0 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "seqfile",
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index 516efcc4c..d0f351e5a 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "ramfs",
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 8f7eb5757..11b680929 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "tmpfs",
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5e9327aec..d799de748 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "tty",
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 9e8ebb907..b0c286b7a 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 go_template_instance(
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index 9fddb4c4c..bfc46dfa6 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_test")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index 907d35b7e..2d50e30aa 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "disklayout",
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index d2450e810..7e364c5fd 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 3d8a4deaf..ade6ac946 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD
index f989f2f8b..d4a420e60 100644
--- a/pkg/sentry/hostcpu/BUILD
+++ b/pkg/sentry/hostcpu/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e61d39c82..e964a991b 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -1,9 +1,10 @@
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "pending_signals_list",
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index f46c43128..65427b112 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "epoll_list",
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index 1c5f979d4..983ca67ed 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "eventfd",
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 6a31dc044..41f44999c 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "atomicptr_bucket",
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 4d15cca85..2ce8952e2 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "buffer_list",
diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD
index 1725b8562..98ea7a0d8 100644
--- a/pkg/sentry/kernel/sched/BUILD
+++ b/pkg/sentry/kernel/sched/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index 36edf10f3..80e5e5da3 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "waiter_list",
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 40025d62d..59649c770 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "limits",
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 29c14ec56..9687e7e76 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "mappable_range",
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 072745a08..b35c8c673 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "file_refcount_set",
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 858f895f2..3fd904c67 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "evictable_range",
diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD
index eeb634644..b6d008dbe 100644
--- a/pkg/sentry/platform/interrupt/BUILD
+++ b/pkg/sentry/platform/interrupt/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index fe979dccf..31fa48ec5 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 3b95af617..ea090b686 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD
index 924d8a6d6..6769cd0a5 100644
--- a/pkg/sentry/platform/safecopy/BUILD
+++ b/pkg/sentry/platform/safecopy/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD
index fd6dc8e6e..884020f7b 100644
--- a/pkg/sentry/safemem/BUILD
+++ b/pkg/sentry/safemem/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD
index 9e2e12799..445080aa4 100644
--- a/pkg/sentry/socket/netlink/port/BUILD
+++ b/pkg/sentry/socket/netlink/port/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "port",
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index 8aa6a3017..beb43ba13 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD
index a5b4206bb..cc5d25762 100644
--- a/pkg/sentry/usermem/BUILD
+++ b/pkg/sentry/usermem/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "addr_range",
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 0f247bf77..eff4b44f6 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD
index 00665c939..bdca80d37 100644
--- a/pkg/sleep/BUILD
+++ b/pkg/sleep/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index c0f3c658d..329904457 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -1,5 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD
index e70f4a79f..8a865d229 100644
--- a/pkg/state/statefile/BUILD
+++ b/pkg/state/statefile/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD
index b149f9e02..bd3f9fd28 100644
--- a/pkg/syserror/BUILD
+++ b/pkg/syserror/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index df37c7d5a..3fd9e3134 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "tcpip",
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index 0d2637ee4..78df5a0b1 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD
index 3301967fb..b4e8d6810 100644
--- a/pkg/tcpip/buffer/BUILD
+++ b/pkg/tcpip/buffer/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "buffer",
diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD
index 29b30be9c..0c5c20cea 100644
--- a/pkg/tcpip/hash/jenkins/BUILD
+++ b/pkg/tcpip/hash/jenkins/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index 76ef02f13..b558350c3 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_library(
     name = "header",
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 74fbbb896..8fa9e3984 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
index ea12ef1ac..1bab380b0 100644
--- a/pkg/tcpip/link/muxed/BUILD
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
index f2998aa98..0a5ea3dc4 100644
--- a/pkg/tcpip/link/sharedmem/BUILD
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
index 94725cb11..330ed5e94 100644
--- a/pkg/tcpip/link/sharedmem/pipe/BUILD
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD
index 160a8f864..de1ce043d 100644
--- a/pkg/tcpip/link/sharedmem/queue/BUILD
+++ b/pkg/tcpip/link/sharedmem/queue/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
index 2597d4b3e..0746dc8ec 100644
--- a/pkg/tcpip/link/waitable/BUILD
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index f36f49453..9d16ff8c9 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_test")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index d95d44f56..df0d3a8c0 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index 118bfc763..c5c7aad86 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "reassembler_list",
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index be84fa63d..58e537aad 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index c71b69123..a471abbfb 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index 989058413..11efb4e44 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 788de3dfe..28c49e8ff 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "linkaddrentry_list",
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 1ee1a53f8..39a839ab7 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "tcp_segment_list",
diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD
index 4bec48c0f..43fcc27f0 100644
--- a/pkg/tcpip/transport/tcpconntrack/BUILD
+++ b/pkg/tcpip/transport/tcpconntrack/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index ac2666f69..c1ca22b35 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "udp_packet_list",
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
index 98d51cc69..6afdb29b7 100644
--- a/pkg/tmutex/BUILD
+++ b/pkg/tmutex/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD
index cbd92fc05..8f6f180e5 100644
--- a/pkg/unet/BUILD
+++ b/pkg/unet/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD
index b7f505a84..b6bbb0ea2 100644
--- a/pkg/urpc/BUILD
+++ b/pkg/urpc/BUILD
@@ -1,4 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD
index 9173dfd0f..8dc88becb 100644
--- a/pkg/waiter/BUILD
+++ b/pkg/waiter/BUILD
@@ -1,7 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
     name = "waiter_list",
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index 60a992b7f..c32eb559f 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -150,9 +150,3 @@ def go_library(name, srcs, deps = [], imports = [], debug = False, **kwargs):
         ],
         **kwargs
     )
-
-def go_test(**kwargs):
-    """Wraps the standard go_test."""
-    _go_test(
-        **kwargs
-    )
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index 947011414..fa82f8e9b 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,6 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
 package(licenses = ["notice"])
 
-load("//tools/go_marshal:defs.bzl", "go_library", "go_test")
+load("//tools/go_marshal:defs.bzl", "go_library")
 
 package_group(
     name = "gomarshal_test",
diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index aeba197e2..3ce36c1c8 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -35,7 +35,7 @@ go_library(
 )
 """
 
-load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test")
+load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library")
 
 def _go_stateify_impl(ctx):
     """Implementation for the stateify tool."""
@@ -60,28 +60,57 @@ def _go_stateify_impl(ctx):
         executable = ctx.executable._tool,
     )
 
-# Generates save and restore logic from a set of Go files.
-#
-# Args:
-#   name: the name of the rule.
-#   srcs: the input source files. These files should include all structs in the package that need to be saved.
-#   imports: an optional list of extra non-aliased, Go-style absolute import paths.
-#   out: the name of the generated file output. This must not conflict with any other files and must be added to the srcs of the relevant go_library.
-#   package: the package name for the input sources.
 go_stateify = rule(
     implementation = _go_stateify_impl,
+    doc = "Generates save and restore logic from a set of Go files.",
     attrs = {
-        "srcs": attr.label_list(mandatory = True, allow_files = True),
-        "imports": attr.string_list(mandatory = False),
-        "package": attr.string(mandatory = True),
-        "out": attr.output(mandatory = True),
-        "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_stateify:stateify")),
+        "srcs": attr.label_list(
+            doc = """
+The input source files. These files should include all structs in the package
+that need to be saved.
+""",
+            mandatory = True,
+            allow_files = True,
+        ),
+        "imports": attr.string_list(
+            doc = """
+An optional list of extra non-aliased, Go-style absolute import paths required
+for statified types.
+""",
+            mandatory = False,
+        ),
+        "package": attr.string(
+            doc = "The package name for the input sources.",
+            mandatory = True,
+        ),
+        "out": attr.output(
+            doc = """
+The name of the generated file output. This must not conflict with any other
+files and must be added to the srcs of the relevant go_library.
+""",
+            mandatory = True,
+        ),
+        "_tool": attr.label(
+            executable = True,
+            cfg = "host",
+            default = Label("//tools/go_stateify:stateify"),
+        ),
         "_statepkg": attr.string(default = "gvisor.dev/gvisor/pkg/state"),
     },
 )
 
 def go_library(name, srcs, deps = [], imports = [], **kwargs):
-    """wraps the standard go_library and does stateification."""
+    """Standard go_library wrapped which generates state source files.
+
+    Args:
+      name: the name of the go_library rule.
+      srcs: sources of the go_library. Each will be processed for stateify
+            annotations.
+      deps: dependencies for the go_library.
+      imports: an optional list of extra non-aliased, Go-style absolute import
+               paths required for stateified types.
+      **kwargs: passed to go_library.
+    """
     if "encode_unsafe.go" not in srcs and (name + "_state_autogen.go") not in srcs:
         # Only do stateification for non-state packages without manual autogen.
         go_stateify(
@@ -105,9 +134,3 @@ def go_library(name, srcs, deps = [], imports = [], **kwargs):
         deps = all_deps,
         **kwargs
     )
-
-def go_test(**kwargs):
-    """Wraps the standard go_test."""
-    _go_test(
-        **kwargs
-    )
-- 
cgit v1.2.3


From 7c6ab6a219f37a1d4c18ced4a602458fcf363f85 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 12 Sep 2019 17:42:14 -0700
Subject: Implement splice methods for pipes and sockets.

This also allows the tee(2) implementation to be enabled, since dup can now be
properly supported via WriteTo.

Note that this change necessitated some minor restructoring with the
fs.FileOperations splice methods. If the *fs.File is passed through directly,
then only public API methods are accessible, which will deadlock immediately
since the locking is already done by fs.Splice. Instead, we pass through an
abstract io.Reader or io.Writer, which elide locks and use the underlying
fs.FileOperations directly.

PiperOrigin-RevId: 268805207
---
 pkg/sentry/fs/file.go                   |  23 +++-
 pkg/sentry/fs/file_operations.go        |   9 +-
 pkg/sentry/fs/file_overlay.go           |   9 +-
 pkg/sentry/fs/fsutil/file.go            |   6 +-
 pkg/sentry/fs/inotify.go                |   5 +-
 pkg/sentry/fs/splice.go                 | 162 +++++++++++++-------------
 pkg/sentry/kernel/pipe/buffer.go        |  25 ++++
 pkg/sentry/kernel/pipe/pipe.go          |  82 +++++++++++---
 pkg/sentry/kernel/pipe/reader_writer.go |  76 ++++++++++++-
 pkg/sentry/socket/epsocket/epsocket.go  | 134 +++++++++++++++++++---
 pkg/sentry/syscalls/linux/linux64.go    |   4 +-
 pkg/sentry/syscalls/linux/sys_splice.go |  86 +++++++-------
 pkg/tcpip/header/udp.go                 |   5 +
 pkg/tcpip/stack/transport_test.go       |   4 +-
 pkg/tcpip/tcpip.go                      |  48 ++++----
 pkg/tcpip/transport/icmp/endpoint.go    |   4 +-
 pkg/tcpip/transport/raw/endpoint.go     |   7 +-
 pkg/tcpip/transport/tcp/endpoint.go     |  68 ++++++-----
 pkg/tcpip/transport/udp/endpoint.go     |  14 +--
 test/syscalls/linux/BUILD               |   3 +
 test/syscalls/linux/pipe.cc             |  14 +++
 test/syscalls/linux/sendfile.cc         |  69 ++++++++++++
 test/syscalls/linux/splice.cc           | 194 +++++++++++++++++++++++++-------
 23 files changed, 770 insertions(+), 281 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index bb8117f89..c0a6e884b 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -515,6 +515,11 @@ type lockedReader struct {
 
 	// File is the file to read from.
 	File *File
+
+	// Offset is the offset to start at.
+	//
+	// This applies only to Read, not ReadAt.
+	Offset int64
 }
 
 // Read implements io.Reader.Read.
@@ -522,7 +527,8 @@ func (r *lockedReader) Read(buf []byte) (int, error) {
 	if r.Ctx.Interrupted() {
 		return 0, syserror.ErrInterrupted
 	}
-	n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.File.offset)
+	n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.Offset)
+	r.Offset += n
 	return int(n), err
 }
 
@@ -544,11 +550,21 @@ type lockedWriter struct {
 
 	// File is the file to write to.
 	File *File
+
+	// Offset is the offset to start at.
+	//
+	// This applies only to Write, not WriteAt.
+	Offset int64
 }
 
 // Write implements io.Writer.Write.
 func (w *lockedWriter) Write(buf []byte) (int, error) {
-	return w.WriteAt(buf, w.File.offset)
+	if w.Ctx.Interrupted() {
+		return 0, syserror.ErrInterrupted
+	}
+	n, err := w.WriteAt(buf, w.Offset)
+	w.Offset += int64(n)
+	return int(n), err
 }
 
 // WriteAt implements io.Writer.WriteAt.
@@ -562,6 +578,9 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
 	// io.Copy, since our own Write interface does not have this same
 	// contract. Enforce that here.
 	for written < len(buf) {
+		if w.Ctx.Interrupted() {
+			return written, syserror.ErrInterrupted
+		}
 		var n int64
 		n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written))
 		if n > 0 {
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index d86f5bf45..b88303f17 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -15,6 +15,8 @@
 package fs
 
 import (
+	"io"
+
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -105,8 +107,11 @@ type FileOperations interface {
 	// on the destination, following by a buffered copy with standard Read
 	// and Write operations.
 	//
+	// If dup is set, the data should be duplicated into the destination
+	// and retained.
+	//
 	// The same preconditions as Read apply.
-	WriteTo(ctx context.Context, file *File, dst *File, opts SpliceOpts) (int64, error)
+	WriteTo(ctx context.Context, file *File, dst io.Writer, count int64, dup bool) (int64, error)
 
 	// Write writes src to file at offset and returns the number of bytes
 	// written which must be greater than or equal to 0. Like Read, file
@@ -126,7 +131,7 @@ type FileOperations interface {
 	// source. See WriteTo for details regarding how this is called.
 	//
 	// The same preconditions as Write apply; FileFlags.Write must be set.
-	ReadFrom(ctx context.Context, file *File, src *File, opts SpliceOpts) (int64, error)
+	ReadFrom(ctx context.Context, file *File, src io.Reader, count int64) (int64, error)
 
 	// Fsync writes buffered modifications of file and/or flushes in-flight
 	// operations to backing storage based on syncType. The range to sync is
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index 9820f0b13..225e40186 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -15,6 +15,7 @@
 package fs
 
 import (
+	"io"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/refs"
@@ -268,9 +269,9 @@ func (f *overlayFileOperations) Read(ctx context.Context, file *File, dst userme
 }
 
 // WriteTo implements FileOperations.WriteTo.
-func (f *overlayFileOperations) WriteTo(ctx context.Context, file *File, dst *File, opts SpliceOpts) (n int64, err error) {
+func (f *overlayFileOperations) WriteTo(ctx context.Context, file *File, dst io.Writer, count int64, dup bool) (n int64, err error) {
 	err = f.onTop(ctx, file, func(file *File, ops FileOperations) error {
-		n, err = ops.WriteTo(ctx, file, dst, opts)
+		n, err = ops.WriteTo(ctx, file, dst, count, dup)
 		return err // Will overwrite itself.
 	})
 	return
@@ -285,9 +286,9 @@ func (f *overlayFileOperations) Write(ctx context.Context, file *File, src userm
 }
 
 // ReadFrom implements FileOperations.ReadFrom.
-func (f *overlayFileOperations) ReadFrom(ctx context.Context, file *File, src *File, opts SpliceOpts) (n int64, err error) {
+func (f *overlayFileOperations) ReadFrom(ctx context.Context, file *File, src io.Reader, count int64) (n int64, err error) {
 	// See above; f.upper must be non-nil.
-	return f.upper.FileOperations.ReadFrom(ctx, f.upper, src, opts)
+	return f.upper.FileOperations.ReadFrom(ctx, f.upper, src, count)
 }
 
 // Fsync implements FileOperations.Fsync.
diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go
index 626b9126a..fc5b3b1a1 100644
--- a/pkg/sentry/fs/fsutil/file.go
+++ b/pkg/sentry/fs/fsutil/file.go
@@ -15,6 +15,8 @@
 package fsutil
 
 import (
+	"io"
+
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -228,12 +230,12 @@ func (FileNoIoctl) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArgu
 type FileNoSplice struct{}
 
 // WriteTo implements fs.FileOperations.WriteTo.
-func (FileNoSplice) WriteTo(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) {
+func (FileNoSplice) WriteTo(context.Context, *fs.File, io.Writer, int64, bool) (int64, error) {
 	return 0, syserror.ENOSYS
 }
 
 // ReadFrom implements fs.FileOperations.ReadFrom.
-func (FileNoSplice) ReadFrom(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) {
+func (FileNoSplice) ReadFrom(context.Context, *fs.File, io.Reader, int64) (int64, error) {
 	return 0, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index c7f4e2d13..ba3e0233d 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -15,6 +15,7 @@
 package fs
 
 import (
+	"io"
 	"sync"
 	"sync/atomic"
 
@@ -172,7 +173,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i
 }
 
 // WriteTo implements FileOperations.WriteTo.
-func (*Inotify) WriteTo(context.Context, *File, *File, SpliceOpts) (int64, error) {
+func (*Inotify) WriteTo(context.Context, *File, io.Writer, int64, bool) (int64, error) {
 	return 0, syserror.ENOSYS
 }
 
@@ -182,7 +183,7 @@ func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error {
 }
 
 // ReadFrom implements FileOperations.ReadFrom.
-func (*Inotify) ReadFrom(context.Context, *File, *File, SpliceOpts) (int64, error) {
+func (*Inotify) ReadFrom(context.Context, *File, io.Reader, int64) (int64, error) {
 	return 0, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index eed1c2854..b03b7f836 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -18,7 +18,6 @@ import (
 	"io"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/secio"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -33,146 +32,131 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 	}
 
 	// Check whether or not the objects being sliced are stream-oriented
-	// (i.e. pipes or sockets). If yes, we elide checks and offset locks.
-	srcPipe := IsPipe(src.Dirent.Inode.StableAttr) || IsSocket(src.Dirent.Inode.StableAttr)
-	dstPipe := IsPipe(dst.Dirent.Inode.StableAttr) || IsSocket(dst.Dirent.Inode.StableAttr)
+	// (i.e. pipes or sockets). For all stream-oriented files and files
+	// where a specific offiset is not request, we acquire the file mutex.
+	// This has two important side effects. First, it provides the standard
+	// protection against concurrent writes that would mutate the offset.
+	// Second, it prevents Splice deadlocks. Only internal anonymous files
+	// implement the ReadFrom and WriteTo methods directly, and since such
+	// anonymous files are referred to by a unique fs.File object, we know
+	// that the file mutex takes strict precedence over internal locks.
+	// Since we enforce lock ordering here, we can't deadlock by using
+	// using a file in two different splice operations simultaneously.
+	srcPipe := !IsRegular(src.Dirent.Inode.StableAttr)
+	dstPipe := !IsRegular(dst.Dirent.Inode.StableAttr)
+	dstAppend := !dstPipe && dst.Flags().Append
+	srcLock := srcPipe || !opts.SrcOffset
+	dstLock := dstPipe || !opts.DstOffset || dstAppend
 
-	if !dstPipe && !opts.DstOffset && !srcPipe && !opts.SrcOffset {
+	switch {
+	case srcLock && dstLock:
 		switch {
 		case dst.UniqueID < src.UniqueID:
 			// Acquire dst first.
 			if !dst.mu.Lock(ctx) {
 				return 0, syserror.ErrInterrupted
 			}
-			defer dst.mu.Unlock()
 			if !src.mu.Lock(ctx) {
+				dst.mu.Unlock()
 				return 0, syserror.ErrInterrupted
 			}
-			defer src.mu.Unlock()
 		case dst.UniqueID > src.UniqueID:
 			// Acquire src first.
 			if !src.mu.Lock(ctx) {
 				return 0, syserror.ErrInterrupted
 			}
-			defer src.mu.Unlock()
 			if !dst.mu.Lock(ctx) {
+				src.mu.Unlock()
 				return 0, syserror.ErrInterrupted
 			}
-			defer dst.mu.Unlock()
 		case dst.UniqueID == src.UniqueID:
 			// Acquire only one lock; it's the same file. This is a
 			// bit of a edge case, but presumably it's possible.
 			if !dst.mu.Lock(ctx) {
 				return 0, syserror.ErrInterrupted
 			}
-			defer dst.mu.Unlock()
+			srcLock = false // Only need one unlock.
 		}
 		// Use both offsets (locked).
 		opts.DstStart = dst.offset
 		opts.SrcStart = src.offset
-	} else if !dstPipe && !opts.DstOffset {
+	case dstLock:
 		// Acquire only dst.
 		if !dst.mu.Lock(ctx) {
 			return 0, syserror.ErrInterrupted
 		}
-		defer dst.mu.Unlock()
 		opts.DstStart = dst.offset // Safe: locked.
-	} else if !srcPipe && !opts.SrcOffset {
+	case srcLock:
 		// Acquire only src.
 		if !src.mu.Lock(ctx) {
 			return 0, syserror.ErrInterrupted
 		}
-		defer src.mu.Unlock()
 		opts.SrcStart = src.offset // Safe: locked.
 	}
 
-	// Check append-only mode and the limit.
-	if !dstPipe {
+	var err error
+	if dstAppend {
 		unlock := dst.Dirent.Inode.lockAppendMu(dst.Flags().Append)
 		defer unlock()
-		if dst.Flags().Append {
-			if opts.DstOffset {
-				// We need to acquire the lock.
-				if !dst.mu.Lock(ctx) {
-					return 0, syserror.ErrInterrupted
-				}
-				defer dst.mu.Unlock()
-			}
-			// Figure out the appropriate offset to use.
-			if err := dst.offsetForAppend(ctx, &opts.DstStart); err != nil {
-				return 0, err
-			}
-		}
 
+		// Figure out the appropriate offset to use.
+		err = dst.offsetForAppend(ctx, &opts.DstStart)
+	}
+	if err == nil && !dstPipe {
 		// Enforce file limits.
 		limit, ok := dst.checkLimit(ctx, opts.DstStart)
 		switch {
 		case ok && limit == 0:
-			return 0, syserror.ErrExceedsFileSizeLimit
+			err = syserror.ErrExceedsFileSizeLimit
 		case ok && limit < opts.Length:
 			opts.Length = limit // Cap the write.
 		}
 	}
+	if err != nil {
+		if dstLock {
+			dst.mu.Unlock()
+		}
+		if srcLock {
+			src.mu.Unlock()
+		}
+		return 0, err
+	}
 
-	// Attempt to do a WriteTo; this is likely the most efficient.
-	//
-	// The underlying implementation may be able to donate buffers.
-	newOpts := SpliceOpts{
-		Length:    opts.Length,
-		SrcStart:  opts.SrcStart,
-		SrcOffset: !srcPipe,
-		Dup:       opts.Dup,
-		DstStart:  opts.DstStart,
-		DstOffset: !dstPipe,
+	// Construct readers and writers for the splice. This is used to
+	// provide a safer locking path for the WriteTo/ReadFrom operations
+	// (since they will otherwise go through public interface methods which
+	// conflict with locking done above), and simplifies the fallback path.
+	w := &lockedWriter{
+		Ctx:    ctx,
+		File:   dst,
+		Offset: opts.DstStart,
 	}
-	n, err := src.FileOperations.WriteTo(ctx, src, dst, newOpts)
-	if n == 0 && err != nil {
-		// Attempt as a ReadFrom. If a WriteTo, a ReadFrom may also
-		// be more efficient than a copy if buffers are cached or readily
-		// available. (It's unlikely that they can actually be donate
-		n, err = dst.FileOperations.ReadFrom(ctx, dst, src, newOpts)
+	r := &lockedReader{
+		Ctx:    ctx,
+		File:   src,
+		Offset: opts.SrcStart,
 	}
-	if n == 0 && err != nil {
-		// If we've failed up to here, and at least one of the sources
-		// is a pipe or socket, then we can't properly support dup.
-		// Return an error indicating that this operation is not
-		// supported.
-		if (srcPipe || dstPipe) && newOpts.Dup {
-			return 0, syserror.EINVAL
-		}
 
-		// We failed to splice the files. But that's fine; we just fall
-		// back to a slow path in this case. This copies without doing
-		// any mode changes, so should still be more efficient.
-		var (
-			r io.Reader
-			w io.Writer
-		)
-		fw := &lockedWriter{
-			Ctx:  ctx,
-			File: dst,
-		}
-		if newOpts.DstOffset {
-			// Use the provided offset.
-			w = secio.NewOffsetWriter(fw, newOpts.DstStart)
-		} else {
-			// Writes will proceed with no offset.
-			w = fw
-		}
-		fr := &lockedReader{
-			Ctx:  ctx,
-			File: src,
-		}
-		if newOpts.SrcOffset {
-			// Limit to the given offset and length.
-			r = io.NewSectionReader(fr, opts.SrcStart, opts.Length)
-		} else {
-			// Limit just to the given length.
-			r = &io.LimitedReader{fr, opts.Length}
-		}
+	// Attempt to do a WriteTo; this is likely the most efficient.
+	n, err := src.FileOperations.WriteTo(ctx, src, w, opts.Length, opts.Dup)
+	if n == 0 && err != nil && err != syserror.ErrWouldBlock && !opts.Dup {
+		// Attempt as a ReadFrom. If a WriteTo, a ReadFrom may also be
+		// more efficient than a copy if buffers are cached or readily
+		// available. (It's unlikely that they can actually be donated).
+		n, err = dst.FileOperations.ReadFrom(ctx, dst, r, opts.Length)
+	}
 
-		// Copy between the two.
-		n, err = io.Copy(w, r)
+	// Support one last fallback option, but only if at least one of
+	// the source and destination are regular files. This is because
+	// if we block at some point, we could lose data. If the source is
+	// not a pipe then reading is not destructive; if the destination
+	// is a regular file, then it is guaranteed not to block writing.
+	if n == 0 && err != nil && err != syserror.ErrWouldBlock && !opts.Dup && (!dstPipe || !srcPipe) {
+		// Fallback to an in-kernel copy.
+		n, err = io.Copy(w, &io.LimitedReader{
+			R: r,
+			N: opts.Length,
+		})
 	}
 
 	// Update offsets, if required.
@@ -185,5 +169,13 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 		}
 	}
 
+	// Drop locks.
+	if dstLock {
+		dst.mu.Unlock()
+	}
+	if srcLock {
+		src.mu.Unlock()
+	}
+
 	return n, err
 }
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
index 69ef2a720..95bee2d37 100644
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ b/pkg/sentry/kernel/pipe/buffer.go
@@ -15,6 +15,7 @@
 package pipe
 
 import (
+	"io"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
@@ -67,6 +68,17 @@ func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
 	return n, err
 }
 
+// WriteFromReader writes to the buffer from an io.Reader.
+func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) {
+	dst := b.data[b.write:]
+	if count < int64(len(dst)) {
+		dst = b.data[b.write:][:count]
+	}
+	n, err := r.Read(dst)
+	b.write += n
+	return int64(n), err
+}
+
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
@@ -75,6 +87,19 @@ func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 	return n, err
 }
 
+// ReadToWriter reads from the buffer into an io.Writer.
+func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) {
+	src := b.data[b.read:b.write]
+	if count < int64(len(src)) {
+		src = b.data[b.read:][:count]
+	}
+	n, err := w.Write(src)
+	if !dup {
+		b.read += n
+	}
+	return int64(n), err
+}
+
 // bufferPool is a pool for buffers.
 var bufferPool = sync.Pool{
 	New: func() interface{} {
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 247e2928e..93b50669f 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -23,7 +23,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -173,13 +172,24 @@ func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.F
 	}
 }
 
+type readOps struct {
+	// left returns the bytes remaining.
+	left func() int64
+
+	// limit limits subsequence reads.
+	limit func(int64)
+
+	// read performs the actual read operation.
+	read func(*buffer) (int64, error)
+}
+
 // read reads data from the pipe into dst and returns the number of bytes
 // read, or returns ErrWouldBlock if the pipe is empty.
 //
 // Precondition: this pipe must have readers.
-func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	// Don't block for a zero-length read even if the pipe is empty.
-	if dst.NumBytes() == 0 {
+	if ops.left() == 0 {
 		return 0, nil
 	}
 
@@ -196,12 +206,12 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 	}
 
 	// Limit how much we consume.
-	if dst.NumBytes() > p.size {
-		dst = dst.TakeFirst64(p.size)
+	if ops.left() > p.size {
+		ops.limit(p.size)
 	}
 
 	done := int64(0)
-	for dst.NumBytes() > 0 {
+	for ops.left() > 0 {
 		// Pop the first buffer.
 		first := p.data.Front()
 		if first == nil {
@@ -209,10 +219,9 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 		}
 
 		// Copy user data.
-		n, err := dst.CopyOutFrom(ctx, first)
+		n, err := ops.read(first)
 		done += int64(n)
 		p.size -= n
-		dst = dst.DropFirst64(n)
 
 		// Empty buffer?
 		if first.Empty() {
@@ -230,12 +239,57 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 	return done, nil
 }
 
+// dup duplicates all data from this pipe into the given writer.
+//
+// There is no blocking behavior implemented here. The writer may propagate
+// some blocking error. All the writes must be complete writes.
+func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Is the pipe empty?
+	if p.size == 0 {
+		if !p.HasWriters() {
+			// See above.
+			return 0, nil
+		}
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// Limit how much we consume.
+	if ops.left() > p.size {
+		ops.limit(p.size)
+	}
+
+	done := int64(0)
+	for buf := p.data.Front(); buf != nil; buf = buf.Next() {
+		n, err := ops.read(buf)
+		done += n
+		if err != nil {
+			return done, err
+		}
+	}
+
+	return done, nil
+}
+
+type writeOps struct {
+	// left returns the bytes remaining.
+	left func() int64
+
+	// limit should limit subsequent writes.
+	limit func(int64)
+
+	// write should write to the provided buffer.
+	write func(*buffer) (int64, error)
+}
+
 // write writes data from sv into the pipe and returns the number of bytes
 // written. If no bytes are written because the pipe is full (or has less than
 // atomicIOBytes free capacity), write returns ErrWouldBlock.
 //
 // Precondition: this pipe must have writers.
-func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) {
+func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
@@ -246,17 +300,16 @@ func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error)
 
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
-	wanted := src.NumBytes()
+	wanted := ops.left()
 	if avail := p.max - p.size; wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
-		// Limit to the available capacity.
-		src = src.TakeFirst64(avail)
+		ops.limit(avail)
 	}
 
 	done := int64(0)
-	for src.NumBytes() > 0 {
+	for ops.left() > 0 {
 		// Need a new buffer?
 		last := p.data.Back()
 		if last == nil || last.Full() {
@@ -266,10 +319,9 @@ func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error)
 		}
 
 		// Copy user data.
-		n, err := src.CopyInTo(ctx, last)
+		n, err := ops.write(last)
 		done += int64(n)
 		p.size += n
-		src = src.DropFirst64(n)
 
 		// Handle errors.
 		if err != nil {
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go
index f69dbf27b..7c307f013 100644
--- a/pkg/sentry/kernel/pipe/reader_writer.go
+++ b/pkg/sentry/kernel/pipe/reader_writer.go
@@ -15,6 +15,7 @@
 package pipe
 
 import (
+	"io"
 	"math"
 	"syscall"
 
@@ -55,7 +56,45 @@ func (rw *ReaderWriter) Release() {
 
 // Read implements fs.FileOperations.Read.
 func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
-	n, err := rw.Pipe.read(ctx, dst)
+	n, err := rw.Pipe.read(ctx, readOps{
+		left: func() int64 {
+			return dst.NumBytes()
+		},
+		limit: func(l int64) {
+			dst = dst.TakeFirst64(l)
+		},
+		read: func(buf *buffer) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, buf)
+			dst = dst.DropFirst64(n)
+			return n, err
+		},
+	})
+	if n > 0 {
+		rw.Pipe.Notify(waiter.EventOut)
+	}
+	return n, err
+}
+
+// WriteTo implements fs.FileOperations.WriteTo.
+func (rw *ReaderWriter) WriteTo(ctx context.Context, _ *fs.File, w io.Writer, count int64, dup bool) (int64, error) {
+	ops := readOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		read: func(buf *buffer) (int64, error) {
+			n, err := buf.ReadToWriter(w, count, dup)
+			count -= n
+			return n, err
+		},
+	}
+	if dup {
+		// There is no notification for dup operations.
+		return rw.Pipe.dup(ctx, ops)
+	}
+	n, err := rw.Pipe.read(ctx, ops)
 	if n > 0 {
 		rw.Pipe.Notify(waiter.EventOut)
 	}
@@ -64,7 +103,40 @@ func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequ
 
 // Write implements fs.FileOperations.Write.
 func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
-	n, err := rw.Pipe.write(ctx, src)
+	n, err := rw.Pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return src.NumBytes()
+		},
+		limit: func(l int64) {
+			src = src.TakeFirst64(l)
+		},
+		write: func(buf *buffer) (int64, error) {
+			n, err := src.CopyInTo(ctx, buf)
+			src = src.DropFirst64(n)
+			return n, err
+		},
+	})
+	if n > 0 {
+		rw.Pipe.Notify(waiter.EventIn)
+	}
+	return n, err
+}
+
+// ReadFrom implements fs.FileOperations.WriteTo.
+func (rw *ReaderWriter) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
+	n, err := rw.Pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(buf *buffer) (int64, error) {
+			n, err := buf.WriteFromReader(r, count)
+			count -= n
+			return n, err
+		},
+	})
 	if n > 0 {
 		rw.Pipe.Notify(waiter.EventIn)
 	}
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 0e37ce61b..3e05e40fe 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -26,6 +26,7 @@ package epsocket
 
 import (
 	"bytes"
+	"io"
 	"math"
 	"reflect"
 	"sync"
@@ -227,7 +228,6 @@ type SocketOperations struct {
 	fsutil.FileNoopFlush            `state:"nosave"`
 	fsutil.FileNoFsync              `state:"nosave"`
 	fsutil.FileNoMMap               `state:"nosave"`
-	fsutil.FileNoSplice             `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
 	socket.SendReceiveTimeout
 	*waiter.Queue
@@ -412,17 +412,58 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 	return int64(n), nil
 }
 
-// ioSequencePayload implements tcpip.Payload. It copies user memory bytes on demand
-// based on the requested size.
+// WriteTo implements fs.FileOperations.WriteTo.
+func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) {
+	s.readMu.Lock()
+	defer s.readMu.Unlock()
+
+	// Copy as much data as possible.
+	done := int64(0)
+	for count > 0 {
+		// This may return a blocking error.
+		if err := s.fetchReadView(); err != nil {
+			return done, err.ToError()
+		}
+
+		// Write to the underlying file.
+		n, err := dst.Write(s.readView)
+		done += int64(n)
+		count -= int64(n)
+		if dup {
+			// That's all we support for dup. This is generally
+			// supported by any Linux system calls, but the
+			// expectation is that now a caller will call read to
+			// actually remove these bytes from the socket.
+			return done, nil
+		}
+
+		// Drop that part of the view.
+		s.readView.TrimFront(n)
+		if err != nil {
+			return done, err
+		}
+	}
+
+	return done, nil
+}
+
+// ioSequencePayload implements tcpip.Payload.
+//
+// t copies user memory bytes on demand based on the requested size.
 type ioSequencePayload struct {
 	ctx context.Context
 	src usermem.IOSequence
 }
 
-// Get implements tcpip.Payload.
-func (i *ioSequencePayload) Get(size int) ([]byte, *tcpip.Error) {
-	if size > i.Size() {
-		size = i.Size()
+// FullPayload implements tcpip.Payloader.FullPayload
+func (i *ioSequencePayload) FullPayload() ([]byte, *tcpip.Error) {
+	return i.Payload(int(i.src.NumBytes()))
+}
+
+// Payload implements tcpip.Payloader.Payload.
+func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
+	if max := int(i.src.NumBytes()); size > max {
+		size = max
 	}
 	v := buffer.NewView(size)
 	if _, err := i.src.CopyIn(i.ctx, v); err != nil {
@@ -431,11 +472,6 @@ func (i *ioSequencePayload) Get(size int) ([]byte, *tcpip.Error) {
 	return v, nil
 }
 
-// Size implements tcpip.Payload.
-func (i *ioSequencePayload) Size() int {
-	return int(i.src.NumBytes())
-}
-
 // DropFirst drops the first n bytes from underlying src.
 func (i *ioSequencePayload) DropFirst(n int) {
 	i.src = i.src.DropFirst(int(n))
@@ -469,6 +505,76 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 	return int64(n), nil
 }
 
+// readerPayload implements tcpip.Payloader.
+//
+// It allocates a view and reads from a reader on-demand, based on available
+// capacity in the endpoint.
+type readerPayload struct {
+	ctx   context.Context
+	r     io.Reader
+	count int64
+	err   error
+}
+
+// FullPayload implements tcpip.Payloader.FullPayload.
+func (r *readerPayload) FullPayload() ([]byte, *tcpip.Error) {
+	return r.Payload(int(r.count))
+}
+
+// Payload implements tcpip.Payloader.Payload.
+func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) {
+	if size > int(r.count) {
+		size = int(r.count)
+	}
+	v := buffer.NewView(size)
+	n, err := r.r.Read(v)
+	if n > 0 {
+		// We ignore the error here. It may re-occur on subsequent
+		// reads, but for now we can enqueue some amount of data.
+		r.count -= int64(n)
+		return v[:n], nil
+	}
+	if err == syserror.ErrWouldBlock {
+		return nil, tcpip.ErrWouldBlock
+	} else if err != nil {
+		r.err = err // Save for propation.
+		return nil, tcpip.ErrBadAddress
+	}
+
+	// There is no data and no error. Return an error, which will propagate
+	// r.err, which will be nil. This is the desired result: (0, nil).
+	return nil, tcpip.ErrBadAddress
+}
+
+// ReadFrom implements fs.FileOperations.ReadFrom.
+func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
+	f := &readerPayload{ctx: ctx, r: r, count: count}
+	n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{})
+	if err == tcpip.ErrWouldBlock {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	if resCh != nil {
+		t := ctx.(*kernel.Task)
+		if err := t.Block(resCh); err != nil {
+			return 0, syserr.FromError(err).ToError()
+		}
+
+		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{
+			// Reads may be destructive but should be very fast,
+			// so we can't release the lock while copying data.
+			Atomic: true,
+		})
+	}
+	if err == tcpip.ErrWouldBlock {
+		return n, syserror.ErrWouldBlock
+	} else if err != nil {
+		return int64(n), f.err // Propagate error.
+	}
+
+	return int64(n), nil
+}
+
 // Readiness returns a mask of ready events for socket s.
 func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	r := s.Endpoint.Readiness(mask)
@@ -2060,7 +2166,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		n, _, err = s.Endpoint.Write(v, opts)
 	}
 	dontWait := flags&linux.MSG_DONTWAIT != 0
-	if err == nil && (n >= int64(v.Size()) || dontWait) {
+	if err == nil && (n >= v.src.NumBytes() || dontWait) {
 		// Complete write.
 		return int(n), nil
 	}
@@ -2085,7 +2191,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 			return 0, syserr.TranslateNetstackError(err)
 		}
 
-		if err == nil && v.Size() == 0 || err != nil && err != tcpip.ErrWouldBlock {
+		if err == nil && v.src.NumBytes() == 0 || err != nil && err != tcpip.ErrWouldBlock {
 			return int(total), nil
 		}
 
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index ed996ba51..150999fb8 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -320,8 +320,8 @@ var AMD64 = &kernel.SyscallTable{
 		272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
 		273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
 		274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		275: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
-		276: syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),                   // TODO(b/29354098)
+		275: syscalls.Supported("splice", Splice),
+		276: syscalls.Supported("tee", Tee),
 		277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
 		278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
 		279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil),                               // requires cap_sys_nice (mostly)
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 8a98fedcb..f0a292f2f 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -29,9 +29,8 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		total int64
 		n     int64
 		err   error
-		ch    chan struct{}
-		inW   bool
-		outW  bool
+		inCh  chan struct{}
+		outCh chan struct{}
 	)
 	for opts.Length > 0 {
 		n, err = fs.Splice(t, outFile, inFile, opts)
@@ -43,35 +42,33 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 			break
 		}
 
-		// Are we a registered waiter?
-		if ch == nil {
-			ch = make(chan struct{}, 1)
-		}
-		if !inW && !inFile.Flags().NonBlocking {
-			w, _ := waiter.NewChannelEntry(ch)
-			inFile.EventRegister(&w, EventMaskRead)
-			defer inFile.EventUnregister(&w)
-			inW = true // Registered.
-		} else if !outW && !outFile.Flags().NonBlocking {
-			w, _ := waiter.NewChannelEntry(ch)
-			outFile.EventRegister(&w, EventMaskWrite)
-			defer outFile.EventUnregister(&w)
-			outW = true // Registered.
-		}
-
-		// Was anything registered? If no, everything is non-blocking.
-		if !inW && !outW {
-			break
-		}
-
-		if (!inW || inFile.Readiness(EventMaskRead) != 0) && (!outW || outFile.Readiness(EventMaskWrite) != 0) {
-			// Something became ready, try again without blocking.
-			continue
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the splice operation.
+		if inFile.Readiness(EventMaskRead) == 0 {
+			if inCh == nil {
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, EventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(inCh); err != nil {
+				break
+			}
 		}
-
-		// Block until there's data.
-		if err = t.Block(ch); err != nil {
-			break
+		if outFile.Readiness(EventMaskWrite) == 0 {
+			if outCh == nil {
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, EventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(outCh); err != nil {
+				break
+			}
 		}
 	}
 
@@ -149,7 +146,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			Length:    count,
 			SrcOffset: true,
 			SrcStart:  offset,
-		}, false)
+		}, outFile.Flags().NonBlocking)
 
 		// Copy out the new offset.
 		if _, err := t.CopyOut(offsetAddr, n+offset); err != nil {
@@ -159,7 +156,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		// Send data using splice.
 		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
 			Length: count,
-		}, false)
+		}, outFile.Flags().NonBlocking)
 	}
 
 	// We can only pass a single file to handleIOError, so pick inFile
@@ -181,12 +178,6 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Only non-blocking is meaningful. Note that unlike in Linux, this
-	// flag is applied consistently. We will have either fully blocking or
-	// non-blocking behavior below, regardless of the underlying files
-	// being spliced to. It's unclear if this is a bug or not yet.
-	nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0
-
 	// Get files.
 	outFile := t.GetFile(outFD)
 	if outFile == nil {
@@ -200,6 +191,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	}
 	defer inFile.DecRef()
 
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
 	// Construct our options.
 	//
 	// Note that exactly one of the underlying buffers must be a pipe. We
@@ -257,7 +255,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	}
 
 	// Splice data.
-	n, err := doSplice(t, outFile, inFile, opts, nonBlocking)
+	n, err := doSplice(t, outFile, inFile, opts, nonBlock)
 
 	// See above; inFile is chosen arbitrarily here.
 	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
@@ -275,9 +273,6 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Only non-blocking is meaningful.
-	nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0
-
 	// Get files.
 	outFile := t.GetFile(outFD)
 	if outFile == nil {
@@ -301,11 +296,14 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 		return 0, nil, syserror.EINVAL
 	}
 
+	// The operation is non-blocking if anything is non-blocking.
+	nonBlock := inFile.Flags().NonBlocking || outFile.Flags().NonBlocking || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
 	// Splice data.
 	n, err := doSplice(t, outFile, inFile, fs.SpliceOpts{
 		Length: count,
 		Dup:    true,
-	}, nonBlocking)
+	}, nonBlock)
 
 	// See above; inFile is chosen arbitrarily here.
 	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "tee", inFile)
diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go
index c1f454805..74412c894 100644
--- a/pkg/tcpip/header/udp.go
+++ b/pkg/tcpip/header/udp.go
@@ -27,6 +27,11 @@ const (
 	udpChecksum = 6
 )
 
+const (
+	// UDPMaximumPacketSize is the largest possible UDP packet.
+	UDPMaximumPacketSize = 0xffff
+)
+
 // UDPFields contains the fields of a UDP packet. It is used to describe the
 // fields of a packet that needs to be encoded.
 type UDPFields struct {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 87d1e0d0d..847d02982 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -65,13 +65,13 @@ func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.Contr
 	return buffer.View{}, tcpip.ControlMessages{}, nil
 }
 
-func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	if len(f.route.RemoteAddress) == 0 {
 		return 0, nil, tcpip.ErrNoRoute
 	}
 
 	hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength()))
-	v, err := p.Get(p.Size())
+	v, err := p.FullPayload()
 	if err != nil {
 		return 0, nil, err
 	}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index ebf8a2d04..2534069ab 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -261,31 +261,34 @@ type FullAddress struct {
 	Port uint16
 }
 
-// Payload provides an interface around data that is being sent to an endpoint.
-// This allows the endpoint to request the amount of data it needs based on
-// internal buffers without exposing them. 'p.Get(p.Size())' reads all the data.
-type Payload interface {
-	// Get returns a slice containing exactly 'min(size, p.Size())' bytes.
-	Get(size int) ([]byte, *Error)
-
-	// Size returns the payload size.
-	Size() int
+// Payloader is an interface that provides data.
+//
+// This interface allows the endpoint to request the amount of data it needs
+// based on internal buffers without exposing them.
+type Payloader interface {
+	// FullPayload returns all available bytes.
+	FullPayload() ([]byte, *Error)
+
+	// Payload returns a slice containing at most size bytes.
+	Payload(size int) ([]byte, *Error)
 }
 
-// SlicePayload implements Payload on top of slices for convenience.
+// SlicePayload implements Payloader for slices.
+//
+// This is typically used for tests.
 type SlicePayload []byte
 
-// Get implements Payload.
-func (s SlicePayload) Get(size int) ([]byte, *Error) {
-	if size > s.Size() {
-		size = s.Size()
-	}
-	return s[:size], nil
+// FullPayload implements Payloader.FullPayload.
+func (s SlicePayload) FullPayload() ([]byte, *Error) {
+	return s, nil
 }
 
-// Size implements Payload.
-func (s SlicePayload) Size() int {
-	return len(s)
+// Payload implements Payloader.Payload.
+func (s SlicePayload) Payload(size int) ([]byte, *Error) {
+	if size > len(s) {
+		size = len(s)
+	}
+	return s[:size], nil
 }
 
 // A ControlMessages contains socket control messages for IP sockets.
@@ -338,7 +341,7 @@ type Endpoint interface {
 	// ErrNoLinkAddress and a notification channel is returned for the caller to
 	// block. Channel is closed once address resolution is complete (success or
 	// not). The channel is only non-nil in this case.
-	Write(Payload, WriteOptions) (int64, <-chan struct{}, *Error)
+	Write(Payloader, WriteOptions) (int64, <-chan struct{}, *Error)
 
 	// Peek reads data without consuming it from the endpoint.
 	//
@@ -432,6 +435,11 @@ type WriteOptions struct {
 
 	// EndOfRecord has the same semantics as Linux's MSG_EOR.
 	EndOfRecord bool
+
+	// Atomic means that all data fetched from Payloader must be written to the
+	// endpoint. If Atomic is false, then data fetched from the Payloader may be
+	// discarded if available endpoint buffer space is unsufficient.
+	Atomic bool
 }
 
 // SockOpt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index e1f622af6..3db060384 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -204,7 +204,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 
 // Write writes data to the endpoint's peer. This method does not block
 // if the data cannot be written.
-func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
 	if opts.More {
 		return 0, nil, tcpip.ErrInvalidOptionValue
@@ -289,7 +289,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-cha
 		}
 	}
 
-	v, err := p.Get(p.Size())
+	v, err := p.FullPayload()
 	if err != nil {
 		return 0, nil, err
 	}
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 13e17e2a6..cf1c5c433 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -207,7 +207,7 @@ func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMes
 }
 
 // Write implements tcpip.Endpoint.Write.
-func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	// MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op.
 	if opts.More {
 		return 0, nil, tcpip.ErrInvalidOptionValue
@@ -220,9 +220,8 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (int64
 		return 0, nil, tcpip.ErrInvalidEndpointState
 	}
 
-	payloadBytes, err := payload.Get(payload.Size())
+	payloadBytes, err := p.FullPayload()
 	if err != nil {
-		ep.mu.RUnlock()
 		return 0, nil, err
 	}
 
@@ -230,7 +229,7 @@ func (ep *endpoint) Write(payload tcpip.Payload, opts tcpip.WriteOptions) (int64
 	// destination address, route using that address.
 	if !ep.associated {
 		ip := header.IPv4(payloadBytes)
-		if !ip.IsValid(payload.Size()) {
+		if !ip.IsValid(len(payloadBytes)) {
 			ep.mu.RUnlock()
 			return 0, nil, tcpip.ErrInvalidOptionValue
 		}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index ac927569a..dd931f88c 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -806,7 +806,7 @@ func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
 }
 
 // Write writes data to the endpoint's peer.
-func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	// Linux completely ignores any address passed to sendto(2) for TCP sockets
 	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
 	// and opts.EndOfRecord are also ignored.
@@ -821,47 +821,52 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-cha
 		return 0, nil, err
 	}
 
-	e.sndBufMu.Unlock()
-	e.mu.RUnlock()
-
-	// Nothing to do if the buffer is empty.
-	if p.Size() == 0 {
-		return 0, nil, nil
+	// We can release locks while copying data.
+	//
+	// This is not possible if atomic is set, because we can't allow the
+	// available buffer space to be consumed by some other caller while we
+	// are copying data in.
+	if !opts.Atomic {
+		e.sndBufMu.Unlock()
+		e.mu.RUnlock()
 	}
 
-	// Copy in memory without holding sndBufMu so that worker goroutine can
-	// make progress independent of this operation.
-	v, perr := p.Get(avail)
-	if perr != nil {
+	// Fetch data.
+	v, perr := p.Payload(avail)
+	if perr != nil || len(v) == 0 {
+		if opts.Atomic { // See above.
+			e.sndBufMu.Unlock()
+			e.mu.RUnlock()
+		}
+		// Note that perr may be nil if len(v) == 0.
 		return 0, nil, perr
 	}
 
-	e.mu.RLock()
-	e.sndBufMu.Lock()
+	if !opts.Atomic { // See above.
+		e.mu.RLock()
+		e.sndBufMu.Lock()
 
-	// Because we released the lock before copying, check state again
-	// to make sure the endpoint is still in a valid state for a
-	// write.
-	avail, err = e.isEndpointWritableLocked()
-	if err != nil {
-		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
-		return 0, nil, err
-	}
+		// Because we released the lock before copying, check state again
+		// to make sure the endpoint is still in a valid state for a write.
+		avail, err = e.isEndpointWritableLocked()
+		if err != nil {
+			e.sndBufMu.Unlock()
+			e.mu.RUnlock()
+			return 0, nil, err
+		}
 
-	// Discard any excess data copied in due to avail being reduced due to a
-	// simultaneous write call to the socket.
-	if avail < len(v) {
-		v = v[:avail]
+		// Discard any excess data copied in due to avail being reduced due
+		// to a simultaneous write call to the socket.
+		if avail < len(v) {
+			v = v[:avail]
+		}
 	}
 
 	// Add data to the send queue.
-	l := len(v)
 	s := newSegmentFromView(&e.route, e.id, v)
-	e.sndBufUsed += l
-	e.sndBufInQueue += seqnum.Size(l)
+	e.sndBufUsed += len(v)
+	e.sndBufInQueue += seqnum.Size(len(v))
 	e.sndQueue.PushBack(s)
-
 	e.sndBufMu.Unlock()
 	// Release the endpoint lock to prevent deadlocks due to lock
 	// order inversion when acquiring workMu.
@@ -875,7 +880,8 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-cha
 		// Let the protocol goroutine do the work.
 		e.sndWaker.Assert()
 	}
-	return int64(l), nil, nil
+
+	return int64(len(v)), nil, nil
 }
 
 // Peek reads data without consuming it from the endpoint.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index dccb9a7eb..6ac7c067a 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -15,7 +15,6 @@
 package udp
 
 import (
-	"math"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -277,17 +276,12 @@ func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress, netPr
 
 // Write writes data to the endpoint's peer. This method does not block
 // if the data cannot be written.
-func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
 	if opts.More {
 		return 0, nil, tcpip.ErrInvalidOptionValue
 	}
 
-	if p.Size() > math.MaxUint16 {
-		// Payload can't possibly fit in a packet.
-		return 0, nil, tcpip.ErrMessageTooLong
-	}
-
 	to := opts.To
 
 	e.mu.RLock()
@@ -370,10 +364,14 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (int64, <-cha
 		}
 	}
 
-	v, err := p.Get(p.Size())
+	v, err := p.FullPayload()
 	if err != nil {
 		return 0, nil, err
 	}
+	if len(v) > header.UDPMaximumPacketSize {
+		// Payload can't possibly fit in a packet.
+		return 0, nil, tcpip.ErrMessageTooLong
+	}
 
 	ttl := route.DefaultTTL()
 	if header.IsV4MulticastAddress(route.RemoteAddress) || header.IsV6MulticastAddress(route.RemoteAddress) {
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 34057e3d0..df00d2c14 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1867,7 +1867,9 @@ cc_binary(
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
+        "//test/util:thread_util",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1901,6 +1903,7 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index 65afb90f3..10e2a6dfc 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -168,6 +168,20 @@ TEST_P(PipeTest, Write) {
   EXPECT_EQ(wbuf, rbuf);
 }
 
+TEST_P(PipeTest, WritePage) {
+  SKIP_IF(!CreateBlocking());
+
+  std::vector<char> wbuf(kPageSize);
+  RandomizeBuffer(wbuf.data(), wbuf.size());
+  std::vector<char> rbuf(wbuf.size());
+
+  ASSERT_THAT(write(wfd_.get(), wbuf.data(), wbuf.size()),
+              SyscallSucceedsWithValue(wbuf.size()));
+  ASSERT_THAT(read(rfd_.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(rbuf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), wbuf.data(), wbuf.size()), 0);
+}
+
 TEST_P(PipeTest, NonBlocking) {
   SKIP_IF(!CreateNonBlocking());
 
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index 9167ab066..4502e7fb4 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -19,9 +19,12 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -442,6 +445,72 @@ TEST(SendFileTest, SendToNotARegularFile) {
   EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, 0),
               SyscallFailsWithErrno(EINVAL));
 }
+
+TEST(SendFileTest, SendPipeWouldBlock) {
+  // Create temp file.
+  constexpr char kData[] =
+      "The fool doth think he is wise, but the wise man knows himself to be a "
+      "fool.";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+  // Open the input file as read only.
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Setup the output named pipe.
+  int fds[2];
+  ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill up the pipe's buffer.
+  int pipe_size = -1;
+  ASSERT_THAT(pipe_size = fcntl(wfd.get(), F_GETPIPE_SZ), SyscallSucceeds());
+  std::vector<char> buf(2 * pipe_size);
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(pipe_size));
+
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST(SendFileTest, SendPipeBlocks) {
+  // Create temp file.
+  constexpr char kData[] =
+      "The fault, dear Brutus, is not in our stars, but in ourselves.";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+  // Open the input file as read only.
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Setup the output named pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill up the pipe's buffer.
+  int pipe_size = -1;
+  ASSERT_THAT(pipe_size = fcntl(wfd.get(), F_GETPIPE_SZ), SyscallSucceeds());
+  std::vector<char> buf(pipe_size);
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(pipe_size));
+
+  ScopedThread t([&]() {
+    absl::SleepFor(absl::Milliseconds(100));
+    ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()),
+                SyscallSucceedsWithValue(pipe_size));
+  });
+
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize),
+              SyscallSucceedsWithValue(kDataSize));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index e25f264f6..85232cb1f 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -14,12 +14,16 @@
 
 #include <fcntl.h>
 #include <sys/eventfd.h>
+#include <sys/resource.h>
 #include <sys/sendfile.h>
+#include <sys/time.h>
 #include <unistd.h>
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -36,23 +40,23 @@ TEST(SpliceTest, TwoRegularFiles) {
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
 
   // Open the input file as read only.
-  const FileDescriptor inf =
+  const FileDescriptor in_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
 
   // Open the output file as write only.
-  const FileDescriptor outf =
+  const FileDescriptor out_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
 
   // Verify that it is rejected as expected; regardless of offsets.
   loff_t in_offset = 0;
   loff_t out_offset = 0;
-  EXPECT_THAT(splice(inf.get(), &in_offset, outf.get(), &out_offset, 1, 0),
+  EXPECT_THAT(splice(in_fd.get(), &in_offset, out_fd.get(), &out_offset, 1, 0),
               SyscallFailsWithErrno(EINVAL));
-  EXPECT_THAT(splice(inf.get(), nullptr, outf.get(), &out_offset, 1, 0),
+  EXPECT_THAT(splice(in_fd.get(), nullptr, out_fd.get(), &out_offset, 1, 0),
               SyscallFailsWithErrno(EINVAL));
-  EXPECT_THAT(splice(inf.get(), &in_offset, outf.get(), nullptr, 1, 0),
+  EXPECT_THAT(splice(in_fd.get(), &in_offset, out_fd.get(), nullptr, 1, 0),
               SyscallFailsWithErrno(EINVAL));
-  EXPECT_THAT(splice(inf.get(), nullptr, outf.get(), nullptr, 1, 0),
+  EXPECT_THAT(splice(in_fd.get(), nullptr, out_fd.get(), nullptr, 1, 0),
               SyscallFailsWithErrno(EINVAL));
 }
 
@@ -75,8 +79,6 @@ TEST(SpliceTest, SamePipe) {
 }
 
 TEST(TeeTest, SamePipe) {
-  SKIP_IF(IsRunningOnGvisor());
-
   // Create a new pipe.
   int fds[2];
   ASSERT_THAT(pipe(fds), SyscallSucceeds());
@@ -95,11 +97,9 @@ TEST(TeeTest, SamePipe) {
 }
 
 TEST(TeeTest, RegularFile) {
-  SKIP_IF(IsRunningOnGvisor());
-
   // Open some file.
   const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  const FileDescriptor inf =
+  const FileDescriptor in_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
 
   // Create a new pipe.
@@ -109,9 +109,9 @@ TEST(TeeTest, RegularFile) {
   const FileDescriptor wfd(fds[1]);
 
   // Attempt to tee from the file.
-  EXPECT_THAT(tee(inf.get(), wfd.get(), kPageSize, 0),
+  EXPECT_THAT(tee(in_fd.get(), wfd.get(), kPageSize, 0),
               SyscallFailsWithErrno(EINVAL));
-  EXPECT_THAT(tee(rfd.get(), inf.get(), kPageSize, 0),
+  EXPECT_THAT(tee(rfd.get(), in_fd.get(), kPageSize, 0),
               SyscallFailsWithErrno(EINVAL));
 }
 
@@ -142,7 +142,7 @@ TEST(SpliceTest, FromEventFD) {
   constexpr uint64_t kEventFDValue = 1;
   int efd;
   ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds());
-  const FileDescriptor inf(efd);
+  const FileDescriptor in_fd(efd);
 
   // Create a new pipe.
   int fds[2];
@@ -152,7 +152,7 @@ TEST(SpliceTest, FromEventFD) {
 
   // Splice 8-byte eventfd value to pipe.
   constexpr int kEventFDSize = 8;
-  EXPECT_THAT(splice(inf.get(), nullptr, wfd.get(), nullptr, kEventFDSize, 0),
+  EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, kEventFDSize, 0),
               SyscallSucceedsWithValue(kEventFDSize));
 
   // Contents should be equal.
@@ -166,7 +166,7 @@ TEST(SpliceTest, FromEventFD) {
 TEST(SpliceTest, FromEventFDOffset) {
   int efd;
   ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds());
-  const FileDescriptor inf(efd);
+  const FileDescriptor in_fd(efd);
 
   // Create a new pipe.
   int fds[2];
@@ -179,7 +179,7 @@ TEST(SpliceTest, FromEventFDOffset) {
   // This is not allowed because eventfd doesn't support pread.
   constexpr int kEventFDSize = 8;
   loff_t in_off = 0;
-  EXPECT_THAT(splice(inf.get(), &in_off, wfd.get(), nullptr, kEventFDSize, 0),
+  EXPECT_THAT(splice(in_fd.get(), &in_off, wfd.get(), nullptr, kEventFDSize, 0),
               SyscallFailsWithErrno(EINVAL));
 }
 
@@ -200,28 +200,29 @@ TEST(SpliceTest, ToEventFDOffset) {
 
   int efd;
   ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds());
-  const FileDescriptor outf(efd);
+  const FileDescriptor out_fd(efd);
 
   // Attempt to splice 8-byte eventfd value to pipe with offset.
   //
   // This is not allowed because eventfd doesn't support pwrite.
   loff_t out_off = 0;
-  EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), &out_off, kEventFDSize, 0),
-              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(
+      splice(rfd.get(), nullptr, out_fd.get(), &out_off, kEventFDSize, 0),
+      SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SpliceTest, ToPipe) {
   // Open the input file.
   const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  const FileDescriptor inf =
+  const FileDescriptor in_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
 
   // Fill with some random data.
   std::vector<char> buf(kPageSize);
   RandomizeBuffer(buf.data(), buf.size());
-  ASSERT_THAT(write(inf.get(), buf.data(), buf.size()),
+  ASSERT_THAT(write(in_fd.get(), buf.data(), buf.size()),
               SyscallSucceedsWithValue(kPageSize));
-  ASSERT_THAT(lseek(inf.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+  ASSERT_THAT(lseek(in_fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
 
   // Create a new pipe.
   int fds[2];
@@ -230,7 +231,7 @@ TEST(SpliceTest, ToPipe) {
   const FileDescriptor wfd(fds[1]);
 
   // Splice to the pipe.
-  EXPECT_THAT(splice(inf.get(), nullptr, wfd.get(), nullptr, kPageSize, 0),
+  EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, kPageSize, 0),
               SyscallSucceedsWithValue(kPageSize));
 
   // Contents should be equal.
@@ -243,13 +244,13 @@ TEST(SpliceTest, ToPipe) {
 TEST(SpliceTest, ToPipeOffset) {
   // Open the input file.
   const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  const FileDescriptor inf =
+  const FileDescriptor in_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
 
   // Fill with some random data.
   std::vector<char> buf(kPageSize);
   RandomizeBuffer(buf.data(), buf.size());
-  ASSERT_THAT(write(inf.get(), buf.data(), buf.size()),
+  ASSERT_THAT(write(in_fd.get(), buf.data(), buf.size()),
               SyscallSucceedsWithValue(kPageSize));
 
   // Create a new pipe.
@@ -261,7 +262,7 @@ TEST(SpliceTest, ToPipeOffset) {
   // Splice to the pipe.
   loff_t in_offset = kPageSize / 2;
   EXPECT_THAT(
-      splice(inf.get(), &in_offset, wfd.get(), nullptr, kPageSize / 2, 0),
+      splice(in_fd.get(), &in_offset, wfd.get(), nullptr, kPageSize / 2, 0),
       SyscallSucceedsWithValue(kPageSize / 2));
 
   // Contents should be equal to only the second part.
@@ -286,22 +287,22 @@ TEST(SpliceTest, FromPipe) {
 
   // Open the input file.
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  const FileDescriptor outf =
+  const FileDescriptor out_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
 
   // Splice to the output file.
-  EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), nullptr, kPageSize, 0),
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, kPageSize, 0),
               SyscallSucceedsWithValue(kPageSize));
 
   // The offset of the output should be equal to kPageSize. We assert that and
   // reset to zero so that we can read the contents and ensure they match.
-  EXPECT_THAT(lseek(outf.get(), 0, SEEK_CUR),
+  EXPECT_THAT(lseek(out_fd.get(), 0, SEEK_CUR),
               SyscallSucceedsWithValue(kPageSize));
-  ASSERT_THAT(lseek(outf.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+  ASSERT_THAT(lseek(out_fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
 
   // Contents should be equal.
   std::vector<char> rbuf(kPageSize);
-  ASSERT_THAT(read(outf.get(), rbuf.data(), rbuf.size()),
+  ASSERT_THAT(read(out_fd.get(), rbuf.data(), rbuf.size()),
               SyscallSucceedsWithValue(kPageSize));
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
@@ -321,18 +322,19 @@ TEST(SpliceTest, FromPipeOffset) {
 
   // Open the input file.
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  const FileDescriptor outf =
+  const FileDescriptor out_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
 
   // Splice to the output file.
   loff_t out_offset = kPageSize / 2;
-  EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), &out_offset, kPageSize, 0),
-              SyscallSucceedsWithValue(kPageSize));
+  EXPECT_THAT(
+      splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kPageSize, 0),
+      SyscallSucceedsWithValue(kPageSize));
 
   // Content should reflect the splice. We write to a specific offset in the
   // file, so the internals should now be allocated sparsely.
   std::vector<char> rbuf(kPageSize);
-  ASSERT_THAT(read(outf.get(), rbuf.data(), rbuf.size()),
+  ASSERT_THAT(read(out_fd.get(), rbuf.data(), rbuf.size()),
               SyscallSucceedsWithValue(kPageSize));
   std::vector<char> zbuf(kPageSize / 2);
   memset(zbuf.data(), 0, zbuf.size());
@@ -404,8 +406,6 @@ TEST(SpliceTest, Blocking) {
 }
 
 TEST(TeeTest, Blocking) {
-  SKIP_IF(IsRunningOnGvisor());
-
   // Create two new pipes.
   int first[2], second[2];
   ASSERT_THAT(pipe(first), SyscallSucceeds());
@@ -440,6 +440,49 @@ TEST(TeeTest, Blocking) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
 }
 
+TEST(TeeTest, BlockingWrite) {
+  // Create two new pipes.
+  int first[2], second[2];
+  ASSERT_THAT(pipe(first), SyscallSucceeds());
+  const FileDescriptor rfd1(first[0]);
+  const FileDescriptor wfd1(first[1]);
+  ASSERT_THAT(pipe(second), SyscallSucceeds());
+  const FileDescriptor rfd2(second[0]);
+  const FileDescriptor wfd2(second[1]);
+
+  // Make some data available to be read.
+  std::vector<char> buf1(kPageSize);
+  RandomizeBuffer(buf1.data(), buf1.size());
+  ASSERT_THAT(write(wfd1.get(), buf1.data(), buf1.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Fill up the write pipe's buffer.
+  int pipe_size = -1;
+  ASSERT_THAT(pipe_size = fcntl(wfd2.get(), F_GETPIPE_SZ), SyscallSucceeds());
+  std::vector<char> buf2(pipe_size);
+  ASSERT_THAT(write(wfd2.get(), buf2.data(), buf2.size()),
+              SyscallSucceedsWithValue(pipe_size));
+
+  ScopedThread t([&]() {
+    absl::SleepFor(absl::Milliseconds(100));
+    ASSERT_THAT(read(rfd2.get(), buf2.data(), buf2.size()),
+                SyscallSucceedsWithValue(pipe_size));
+  });
+
+  // Attempt a tee immediately; it should block.
+  EXPECT_THAT(tee(rfd1.get(), wfd2.get(), kPageSize, 0),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Thread should be joinable.
+  t.Join();
+
+  // Content should reflect the tee.
+  std::vector<char> rbuf(kPageSize);
+  ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+  EXPECT_EQ(memcmp(rbuf.data(), buf1.data(), kPageSize), 0);
+}
+
 TEST(SpliceTest, NonBlocking) {
   // Create two new pipes.
   int first[2], second[2];
@@ -457,8 +500,6 @@ TEST(SpliceTest, NonBlocking) {
 }
 
 TEST(TeeTest, NonBlocking) {
-  SKIP_IF(IsRunningOnGvisor());
-
   // Create two new pipes.
   int first[2], second[2];
   ASSERT_THAT(pipe(first), SyscallSucceeds());
@@ -473,6 +514,79 @@ TEST(TeeTest, NonBlocking) {
               SyscallFailsWithErrno(EAGAIN));
 }
 
+TEST(TeeTest, MultiPage) {
+  // Create two new pipes.
+  int first[2], second[2];
+  ASSERT_THAT(pipe(first), SyscallSucceeds());
+  const FileDescriptor rfd1(first[0]);
+  const FileDescriptor wfd1(first[1]);
+  ASSERT_THAT(pipe(second), SyscallSucceeds());
+  const FileDescriptor rfd2(second[0]);
+  const FileDescriptor wfd2(second[1]);
+
+  // Make some data available to be read.
+  std::vector<char> wbuf(8 * kPageSize);
+  RandomizeBuffer(wbuf.data(), wbuf.size());
+  ASSERT_THAT(write(wfd1.get(), wbuf.data(), wbuf.size()),
+              SyscallSucceedsWithValue(wbuf.size()));
+
+  // Attempt a tee immediately; it should complete.
+  EXPECT_THAT(tee(rfd1.get(), wfd2.get(), wbuf.size(), 0),
+              SyscallSucceedsWithValue(wbuf.size()));
+
+  // Content should reflect the tee.
+  std::vector<char> rbuf(wbuf.size());
+  ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(rbuf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), wbuf.data(), rbuf.size()), 0);
+  ASSERT_THAT(read(rfd1.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(rbuf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), wbuf.data(), rbuf.size()), 0);
+}
+
+TEST(SpliceTest, FromPipeMaxFileSize) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill with some random data.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the input file.
+  const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor out_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+  EXPECT_THAT(ftruncate(out_fd.get(), 13 << 20), SyscallSucceeds());
+  EXPECT_THAT(lseek(out_fd.get(), 0, SEEK_END),
+              SyscallSucceedsWithValue(13 << 20));
+
+  // Set our file size limit.
+  sigset_t set;
+  sigemptyset(&set);
+  sigaddset(&set, SIGXFSZ);
+  TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+  rlimit rlim = {};
+  rlim.rlim_cur = rlim.rlim_max = (13 << 20);
+  EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &rlim), SyscallSucceeds());
+
+  // Splice to the output file.
+  EXPECT_THAT(
+      splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3 * kPageSize, 0),
+      SyscallFailsWithErrno(EFBIG));
+
+  // Contents should be equal.
+  std::vector<char> rbuf(kPageSize);
+  ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+  EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 239a07aabfad8991556b43c85c30270d09353f86 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 13 Sep 2019 21:43:12 -0700
Subject: gvisor: return ENOTDIR from the unlink syscall

ENOTDIR has to be returned when a component used as a directory in
pathname is not, in  fact,  a directory.

PiperOrigin-RevId: 269037893
---
 pkg/sentry/fs/dirent.go               | 4 +++-
 pkg/sentry/fs/dirent_refs_test.go     | 2 +-
 pkg/sentry/syscalls/linux/sys_file.go | 5 +----
 test/syscalls/linux/unlink.cc         | 2 ++
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index fbca06761..3cb73bd78 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -1126,7 +1126,7 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
 
 // Remove removes the given file or symlink.  The root dirent is used to
 // resolve name, and must not be nil.
-func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error {
+func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath bool) error {
 	// Check the root.
 	if root == nil {
 		panic("Dirent.Remove: root must not be nil")
@@ -1151,6 +1151,8 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error {
 	// Remove cannot remove directories.
 	if IsDir(child.Inode.StableAttr) {
 		return syscall.EISDIR
+	} else if dirPath {
+		return syscall.ENOTDIR
 	}
 
 	// Remove cannot remove a mount point.
diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go
index 884e3ff06..47bc72a88 100644
--- a/pkg/sentry/fs/dirent_refs_test.go
+++ b/pkg/sentry/fs/dirent_refs_test.go
@@ -343,7 +343,7 @@ func TestRemoveExtraRefs(t *testing.T) {
 			}
 			d := f.Dirent
 
-			if err := test.root.Remove(contexttest.Context(t), test.root, name); err != nil {
+			if err := test.root.Remove(contexttest.Context(t), test.root, name, false /* dirPath */); err != nil {
 				t.Fatalf("root.Remove(root, %q) failed: %v", name, err)
 			}
 
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 2e00a91ce..b9a8e3e21 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -1423,9 +1423,6 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	if dirPath {
-		return syserror.ENOENT
-	}
 
 	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
 		if !fs.IsDir(d.Inode.StableAttr) {
@@ -1436,7 +1433,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return err
 		}
 
-		return d.Remove(t, root, name)
+		return d.Remove(t, root, name, dirPath)
 	})
 }
 
diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc
index b6f65e027..2040375c9 100644
--- a/test/syscalls/linux/unlink.cc
+++ b/test/syscalls/linux/unlink.cc
@@ -123,6 +123,8 @@ TEST(UnlinkTest, AtBad) {
               SyscallSucceeds());
   EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", AT_REMOVEDIR),
               SyscallFailsWithErrno(ENOTDIR));
+  EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile/", 0),
+              SyscallFailsWithErrno(ENOTDIR));
   ASSERT_THAT(close(fd), SyscallSucceeds());
   EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", 0), SyscallSucceeds());
 
-- 
cgit v1.2.3


From 3b7119a7c91789f69d8637401a1359229a33b213 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 17 Sep 2019 12:44:05 -0700
Subject: platform/ptrace: log exit code for stub processes

PiperOrigin-RevId: 269631877
---
 pkg/sentry/platform/ptrace/ptrace_unsafe.go | 16 ++++++++++++++++
 pkg/sentry/platform/ptrace/subprocess.go    |  6 +++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 47957bb3b..72c7ec564 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -154,3 +154,19 @@ func (t *thread) clone() (*thread, error) {
 		cpu:  ^uint32(0),
 	}, nil
 }
+
+// getEventMessage retrieves a message about the ptrace event that just happened.
+func (t *thread) getEventMessage() (uintptr, error) {
+	var msg uintptr
+	_, _, errno := syscall.RawSyscall6(
+		syscall.SYS_PTRACE,
+		syscall.PTRACE_GETEVENTMSG,
+		uintptr(t.tid),
+		0,
+		uintptr(unsafe.Pointer(&msg)),
+		0, 0)
+	if errno != 0 {
+		return msg, errno
+	}
+	return msg, nil
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 6bf7cd097..4f8f9c5d9 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -355,7 +355,8 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
 			}
 			if stopSig == syscall.SIGTRAP {
 				if status.TrapCause() == syscall.PTRACE_EVENT_EXIT {
-					t.dumpAndPanic("wait failed: the process exited")
+					msg, err := t.getEventMessage()
+					t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
 				}
 				// Re-encode the trap cause the way it's expected.
 				return stopSig | syscall.Signal(status.TrapCause()<<8)
@@ -426,6 +427,9 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 			break
 		} else {
 			// Some other signal caused a thread stop; ignore.
+			if sig != syscall.SIGSTOP && sig != syscall.SIGCHLD {
+				log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
+			}
 			continue
 		}
 	}
-- 
cgit v1.2.3


From c98e7f0d19478ca57ba8c96444f225784035321e Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 18 Sep 2019 15:15:16 -0700
Subject: Signalfd support

Note that the exact semantics for these signalfds are slightly different from
Linux. These signalfds are bound to the process at creation time. Reads, polls,
etc. are all associated with signals directed at that task. In Linux, all
signalfd operations are associated with current, regardless of where the
signalfd originated.

In practice, this should not be an issue given how signalfds are used. In order
to fix this however, we will need to plumb the context through all the event
APIs. This gets complicated really quickly, because the waiter APIs are all
netstack-specific, and not generally exposed to the context.  Probably not
worthwhile fixing immediately.

PiperOrigin-RevId: 269901749
---
 pkg/abi/linux/BUILD                     |   1 +
 pkg/abi/linux/signalfd.go               |  45 +++++
 pkg/sentry/kernel/signalfd/BUILD        |  22 +++
 pkg/sentry/kernel/signalfd/signalfd.go  | 137 +++++++++++++
 pkg/sentry/kernel/task.go               |   8 +
 pkg/sentry/kernel/task_signals.go       |  18 ++
 pkg/sentry/syscalls/linux/BUILD         |   1 +
 pkg/sentry/syscalls/linux/linux64.go    |   4 +-
 pkg/sentry/syscalls/linux/sys_signal.go |  77 ++++++++
 test/syscalls/linux/BUILD               |  18 ++
 test/syscalls/linux/signalfd.cc         | 333 ++++++++++++++++++++++++++++++++
 11 files changed, 662 insertions(+), 2 deletions(-)
 create mode 100644 pkg/abi/linux/signalfd.go
 create mode 100644 pkg/sentry/kernel/signalfd/BUILD
 create mode 100644 pkg/sentry/kernel/signalfd/signalfd.go
 create mode 100644 test/syscalls/linux/signalfd.cc

(limited to 'pkg/sentry')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 39c92bb33..f45934466 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -46,6 +46,7 @@ go_library(
         "sem.go",
         "shm.go",
         "signal.go",
+        "signalfd.go",
         "socket.go",
         "splice.go",
         "tcp.go",
diff --git a/pkg/abi/linux/signalfd.go b/pkg/abi/linux/signalfd.go
new file mode 100644
index 000000000..85fad9956
--- /dev/null
+++ b/pkg/abi/linux/signalfd.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+const (
+	// SFD_NONBLOCK is a signalfd(2) flag.
+	SFD_NONBLOCK = 00004000
+
+	// SFD_CLOEXEC is a signalfd(2) flag.
+	SFD_CLOEXEC = 02000000
+)
+
+// SignalfdSiginfo is the siginfo encoding for signalfds.
+type SignalfdSiginfo struct {
+	Signo   uint32
+	Errno   int32
+	Code    int32
+	PID     uint32
+	UID     uint32
+	FD      int32
+	TID     uint32
+	Band    uint32
+	Overrun uint32
+	TrapNo  uint32
+	Status  int32
+	Int     int32
+	Ptr     uint64
+	UTime   uint64
+	STime   uint64
+	Addr    uint64
+	AddrLSB uint16
+	_       [48]uint8
+}
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
new file mode 100644
index 000000000..50b69d154
--- /dev/null
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -0,0 +1,22 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+go_library(
+    name = "signalfd",
+    srcs = ["signalfd.go"],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/anon",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/usermem",
+        "//pkg/syserror",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
new file mode 100644
index 000000000..06fd5ec88
--- /dev/null
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -0,0 +1,137 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package signalfd provides an implementation of signal file descriptors.
+package signalfd
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SignalOperations represent a file with signalfd semantics.
+//
+// +stateify savable
+type SignalOperations struct {
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FilePipeSeek             `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoFsync              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoWrite              `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	// target is the original task target.
+	//
+	// The semantics here are a bit broken. Linux will always use current
+	// for all reads, regardless of where the signalfd originated. We can't
+	// do exactly that because we need to plumb the context through
+	// EventRegister in order to support proper blocking behavior. This
+	// will undoubtedly become very complicated quickly.
+	target *kernel.Task
+
+	// mu protects below.
+	mu sync.Mutex `state:"nosave"`
+
+	// mask is the signal mask. Protected by mu.
+	mask linux.SignalSet
+}
+
+// New creates a new signalfd object with the supplied mask.
+func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// No task context? Not valid.
+		return nil, syserror.EINVAL
+	}
+	// name matches fs/signalfd.c:signalfd4.
+	dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]")
+	return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &SignalOperations{
+		target: t,
+		mask:   mask,
+	}), nil
+}
+
+// Release implements fs.FileOperations.Release.
+func (s *SignalOperations) Release() {}
+
+// Mask returns the signal mask.
+func (s *SignalOperations) Mask() linux.SignalSet {
+	s.mu.Lock()
+	mask := s.mask
+	s.mu.Unlock()
+	return mask
+}
+
+// SetMask sets the signal mask.
+func (s *SignalOperations) SetMask(mask linux.SignalSet) {
+	s.mu.Lock()
+	s.mask = mask
+	s.mu.Unlock()
+}
+
+// Read implements fs.FileOperations.Read.
+func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+	// Attempt to dequeue relevant signals.
+	info, err := s.target.Sigtimedwait(s.Mask(), 0)
+	if err != nil {
+		// There must be no signal available.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// Copy out the signal info using the specified format.
+	var buf [128]byte
+	binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+		Signo:   uint32(info.Signo),
+		Errno:   info.Errno,
+		Code:    info.Code,
+		PID:     uint32(info.Pid()),
+		UID:     uint32(info.Uid()),
+		Status:  info.Status(),
+		Overrun: uint32(info.Overrun()),
+		Addr:    info.Addr(),
+	})
+	n, err := dst.CopyOut(ctx, buf[:])
+	return int64(n), err
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *SignalOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return mask & waiter.EventIn
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *SignalOperations) EventRegister(entry *waiter.Entry, _ waiter.EventMask) {
+	// Register for the signal set; ignore the passed events.
+	s.target.SignalRegister(entry, waiter.EventMask(s.Mask()))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *SignalOperations) EventUnregister(entry *waiter.Entry) {
+	// Unregister the original entry.
+	s.target.SignalUnregister(entry)
+}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e91f82bb3..c82ef5486 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
 	"gvisor.dev/gvisor/third_party/gvsync"
 )
 
@@ -133,6 +134,13 @@ type Task struct {
 	// signalStack is exclusive to the task goroutine.
 	signalStack arch.SignalStack
 
+	// signalQueue is a set of registered waiters for signal-related events.
+	//
+	// signalQueue is protected by the signalMutex. Note that the task does
+	// not implement all queue methods, specifically the readiness checks.
+	// The task only broadcast a notification on signal delivery.
+	signalQueue waiter.Queue `state:"zerovalue"`
+
 	// If groupStopPending is true, the task should participate in a group
 	// stop in the interrupt path.
 	//
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 266959a07..39cd1340d 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -28,6 +28,7 @@ import (
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 // SignalAction is an internal signal action.
@@ -497,6 +498,9 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) {
 //
 // Preconditions: The signal mutex must be locked.
 func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
+	// Notify that the signal is queued.
+	t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig)))
+
 	// - Do not choose tasks that are blocking the signal.
 	if linux.SignalSetOf(sig)&t.signalMask != 0 {
 		return false
@@ -1108,3 +1112,17 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
 	t.tg.signalHandlers.mu.Unlock()
 	return t.deliverSignal(info, act)
 }
+
+// SignalRegister registers a waiter for pending signals.
+func (t *Task) SignalRegister(e *waiter.Entry, mask waiter.EventMask) {
+	t.tg.signalHandlers.mu.Lock()
+	t.signalQueue.EventRegister(e, mask)
+	t.tg.signalHandlers.mu.Unlock()
+}
+
+// SignalUnregister unregisters a waiter for pending signals.
+func (t *Task) SignalUnregister(e *waiter.Entry) {
+	t.tg.signalHandlers.mu.Lock()
+	t.signalQueue.EventUnregister(e)
+	t.tg.signalHandlers.mu.Unlock()
+}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 33a40b9c6..e76ee27d2 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -74,6 +74,7 @@ go_library(
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/sched",
         "//pkg/sentry/kernel/shm",
+        "//pkg/sentry/kernel/signalfd",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 150999fb8..18d24ab61 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -327,14 +327,14 @@ var AMD64 = &kernel.SyscallTable{
 		279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil),                               // requires cap_sys_nice (mostly)
 		280: syscalls.Supported("utimensat", Utimensat),
 		281: syscalls.Supported("epoll_pwait", EpollPwait),
-		282: syscalls.ErrorWithEvent("signalfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+		282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
 		283: syscalls.Supported("timerfd_create", TimerfdCreate),
 		284: syscalls.Supported("eventfd", Eventfd),
 		285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
 		286: syscalls.Supported("timerfd_settime", TimerfdSettime),
 		287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
 		288: syscalls.Supported("accept4", Accept4),
-		289: syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+		289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
 		290: syscalls.Supported("eventfd2", Eventfd2),
 		291: syscalls.Supported("epoll_create1", EpollCreate1),
 		292: syscalls.Supported("dup3", Dup3),
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index 0104a94c0..fb6efd5d8 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -20,7 +20,10 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/signalfd"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -506,3 +509,77 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?")
 	return 0, nil, syserror.EINTR
 }
+
+// sharedSignalfd is shared between the two calls.
+func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	// Copy in the signal mask.
+	mask, err := copyInSigSet(t, sigset, sigsetsize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Always check for valid flags, even if not creating.
+	if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is this a change to an existing signalfd?
+	//
+	// The spec indicates that this should adjust the mask.
+	if fd != -1 {
+		file := t.GetFile(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// Is this a signalfd?
+		if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok {
+			s.SetMask(mask)
+			return 0, nil, nil
+		}
+
+		// Not a signalfd.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create a new file.
+	file, err := signalfd.New(t, mask)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	// Set appropriate flags.
+	file.SetFlags(fs.SettableFileFlags{
+		NonBlocking: flags&linux.SFD_NONBLOCK != 0,
+	})
+
+	// Create a new descriptor.
+	fd, err = t.NewFDFrom(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Done.
+	return uintptr(fd), nil, nil
+}
+
+// Signalfd implements the linux syscall signalfd(2).
+func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	sigset := args[1].Pointer()
+	sigsetsize := args[2].SizeT()
+	return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
+}
+
+// Signalfd4 implements the linux syscall signalfd4(2).
+func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	sigset := args[1].Pointer()
+	sigsetsize := args[2].SizeT()
+	flags := args[3].Int()
+	return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index eac32850d..56fe7be37 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1963,6 +1963,24 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "signalfd_test",
+    testonly = 1,
+    srcs = ["signalfd.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:posix_error",
+        "//test/util:signal_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "sigprocmask_test",
     testonly = 1,
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
new file mode 100644
index 000000000..54c598627
--- /dev/null
+++ b/test/syscalls/linux/signalfd.cc
@@ -0,0 +1,333 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/signalfd.h>
+#include <unistd.h>
+
+#include <functional>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "absl/synchronization/mutex.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+using ::testing::KilledBySignal;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kSigno = SIGUSR1;
+constexpr int kSignoAlt = SIGUSR2;
+
+// Returns a new signalfd.
+inline PosixErrorOr<FileDescriptor> NewSignalFD(sigset_t* mask, int flags = 0) {
+  int fd = signalfd(-1, mask, flags);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, "signalfd");
+  }
+  return FileDescriptor(fd);
+}
+
+TEST(Signalfd, Basic) {
+  // Create the signalfd.
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, kSigno);
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+  // Deliver the blocked signal.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+
+  // We should now read the signal.
+  struct signalfd_siginfo rbuf;
+  ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallSucceedsWithValue(sizeof(rbuf)));
+  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+}
+
+TEST(Signalfd, MaskWorks) {
+  // Create two signalfds with different masks.
+  sigset_t mask1, mask2;
+  sigemptyset(&mask1);
+  sigemptyset(&mask2);
+  sigaddset(&mask1, kSigno);
+  sigaddset(&mask2, kSignoAlt);
+  FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask1, 0));
+  FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask2, 0));
+
+  // Deliver the two signals.
+  const auto scoped_sigmask1 =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+  const auto scoped_sigmask2 =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSignoAlt));
+  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(tgkill(getpid(), gettid(), kSignoAlt), SyscallSucceeds());
+
+  // We should see the signals on the appropriate signalfds.
+  //
+  // We read in the opposite order as the signals deliver above, to ensure that
+  // we don't happen to read the correct signal from the correct signalfd.
+  struct signalfd_siginfo rbuf1, rbuf2;
+  ASSERT_THAT(read(fd2.get(), &rbuf2, sizeof(rbuf2)),
+              SyscallSucceedsWithValue(sizeof(rbuf2)));
+  EXPECT_EQ(rbuf2.ssi_signo, kSignoAlt);
+  ASSERT_THAT(read(fd1.get(), &rbuf1, sizeof(rbuf1)),
+              SyscallSucceedsWithValue(sizeof(rbuf1)));
+  EXPECT_EQ(rbuf1.ssi_signo, kSigno);
+}
+
+TEST(Signalfd, Cloexec) {
+  // Exec tests confirm that O_CLOEXEC has the intended effect. We just create a
+  // signalfd with the appropriate flag here and assert that the FD has it set.
+  sigset_t mask;
+  sigemptyset(&mask);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+  EXPECT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST(Signalfd, Blocking) {
+  // Create the signalfd in blocking mode.
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, kSigno);
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+  // Shared tid variable.
+  absl::Mutex mu;
+  bool has_tid;
+  pid_t tid;
+
+  // Start a thread reading.
+  ScopedThread t([&] {
+    // Copy the tid and notify the caller.
+    {
+      absl::MutexLock ml(&mu);
+      tid = gettid();
+      has_tid = true;
+    }
+
+    // Read the signal from the signalfd.
+    struct signalfd_siginfo rbuf;
+    ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+                SyscallSucceedsWithValue(sizeof(rbuf)));
+    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  });
+
+  // Wait until blocked.
+  absl::MutexLock ml(&mu);
+  mu.Await(absl::Condition(&has_tid));
+
+  // Deliver the signal to either the waiting thread, or
+  // to this thread. N.B. this is a bug in the core gVisor
+  // behavior for signalfd, and needs to be fixed.
+  //
+  // See gvisor.dev/issue/139.
+  if (IsRunningOnGvisor()) {
+    ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+  } else {
+    ASSERT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds());
+  }
+
+  // Ensure that it was received.
+  t.Join();
+}
+
+TEST(Signalfd, ThreadGroup) {
+  // Create the signalfd in blocking mode.
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, kSigno);
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+  // Shared variable.
+  absl::Mutex mu;
+  bool first = false;
+  bool second = false;
+
+  // Start a thread reading.
+  ScopedThread t([&] {
+    // Read the signal from the signalfd.
+    struct signalfd_siginfo rbuf;
+    ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+                SyscallSucceedsWithValue(sizeof(rbuf)));
+    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+
+    // Wait for the other thread.
+    absl::MutexLock ml(&mu);
+    first = true;
+    mu.Await(absl::Condition(&second));
+  });
+
+  // Deliver the signal to the threadgroup.
+  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+
+  // Wait for the first thread to process.
+  {
+    absl::MutexLock ml(&mu);
+    mu.Await(absl::Condition(&first));
+  }
+
+  // Deliver to the thread group again (other thread still exists).
+  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+
+  // Ensure that we can also receive it.
+  struct signalfd_siginfo rbuf;
+  ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallSucceedsWithValue(sizeof(rbuf)));
+  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+
+  // Mark the test as done.
+  {
+    absl::MutexLock ml(&mu);
+    second = true;
+  }
+
+  // The other thread should be joinable.
+  t.Join();
+}
+
+TEST(Signalfd, Nonblock) {
+  // Create the signalfd in non-blocking mode.
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, kSigno);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
+
+  // We should return if we attempt to read.
+  struct signalfd_siginfo rbuf;
+  ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Block and deliver the signal.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+
+  // Ensure that a read actually works.
+  ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallSucceedsWithValue(sizeof(rbuf)));
+  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+
+  // Should block again.
+  EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST(Signalfd, SetMask) {
+  // Create the signalfd matching nothing.
+  sigset_t mask;
+  sigemptyset(&mask);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
+
+  // Block and deliver a signal.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+
+  // We should have nothing.
+  struct signalfd_siginfo rbuf;
+  ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Change the signal mask.
+  sigaddset(&mask, kSigno);
+  ASSERT_THAT(signalfd(fd.get(), &mask, 0), SyscallSucceeds());
+
+  // We should now have the signal.
+  ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallSucceedsWithValue(sizeof(rbuf)));
+  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+}
+
+TEST(Signalfd, Poll) {
+  // Create the signalfd.
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, kSigno);
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+  // Block the signal, and start a thread to deliver it.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+  pid_t orig_tid = gettid();
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Seconds(5));
+    ASSERT_THAT(tgkill(getpid(), orig_tid, kSigno), SyscallSucceeds());
+  });
+
+  // Start polling for the signal. We expect that it is not available at the
+  // outset, but then becomes available when the signal is sent. We give a
+  // timeout of 10000ms (or the delay above + 5 seconds of additional grace
+  // time).
+  struct pollfd poll_fd = {fd.get(), POLLIN, 0};
+  EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+              SyscallSucceedsWithValue(1));
+
+  // Actually read the signal to prevent delivery.
+  struct signalfd_siginfo rbuf;
+  EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+              SyscallSucceedsWithValue(sizeof(rbuf)));
+}
+
+TEST(Signalfd, KillStillKills) {
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGKILL);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+
+  // Just because there is a signalfd, we shouldn't see any change in behavior
+  // for unblockable signals. It's easier to test this with SIGKILL.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
+  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
+
+int main(int argc, char** argv) {
+  // These tests depend on delivering signals. Block them up front so that all
+  // other threads created by TestInit will also have them blocked, and they
+  // will not interface with the rest of the test.
+  sigset_t set;
+  sigemptyset(&set);
+  sigaddset(&set, gvisor::testing::kSigno);
+  sigaddset(&set, gvisor::testing::kSignoAlt);
+  TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+
+  gvisor::testing::TestInit(&argc, &argv);
+
+  return RUN_ALL_TESTS();
+}
-- 
cgit v1.2.3


From cabe10e603f36d610b7d3858c0911fc2dde26411 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 17 Sep 2019 08:27:01 +0000
Subject: Enable pkg/sentry/hostcpu support on arm64.

Signed-off-by: Haibo Xu haibo.xu@arm.com
Change-Id: I333872da9bdf56ddfa8ab2f034dfc1f36a7d3132
---
 pkg/sentry/hostcpu/BUILD          |  1 +
 pkg/sentry/hostcpu/getcpu_arm64.s | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 pkg/sentry/hostcpu/getcpu_arm64.s

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD
index d4a420e60..359468ccc 100644
--- a/pkg/sentry/hostcpu/BUILD
+++ b/pkg/sentry/hostcpu/BUILD
@@ -7,6 +7,7 @@ go_library(
     name = "hostcpu",
     srcs = [
         "getcpu_amd64.s",
+        "getcpu_arm64.s",
         "hostcpu.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/hostcpu",
diff --git a/pkg/sentry/hostcpu/getcpu_arm64.s b/pkg/sentry/hostcpu/getcpu_arm64.s
new file mode 100644
index 000000000..caf9abb89
--- /dev/null
+++ b/pkg/sentry/hostcpu/getcpu_arm64.s
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// GetCPU makes the getcpu(unsigned *cpu, unsigned *node, NULL) syscall for
+// the lack of an optimazed way of getting the current CPU number on arm64.
+
+// func GetCPU() (cpu uint32)
+TEXT ·GetCPU(SB), NOSPLIT, $0-4
+	MOVW ZR, cpu+0(FP)
+	MOVD $cpu+0(FP), R0
+	MOVD $0x0, R1  // unused
+	MOVD $0x0, R2  // unused
+	MOVD $0xA8, R8 // SYS_GETCPU
+	SVC
+	RET
-- 
cgit v1.2.3


From d72c63664b46af52f4880a2a015666a52deb0749 Mon Sep 17 00:00:00 2001
From: Hang Su <darcy.sh@antfin.com>
Date: Wed, 18 Sep 2019 13:35:23 +0800
Subject: Accelerate byte lookup in string with `bytealg/indexbyte`

`bytealg/indexbyte` will use AVX or SSE instruction set, if possible,
which could accelerate `CopyStringIn` function by 28%.

In worst case(CPU doesn't support SSE), `bytealg/indexbyte`
will degenerate to traversal lookup. When dealing with
short strings, `bytealg/indexbyte` has the same performance level as
before.

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Signed-off-by: Hang Su <darcy.sh@antfin.com>
---
 pkg/sentry/usermem/usermem.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go
index 6eced660a..7b1f312b1 100644
--- a/pkg/sentry/usermem/usermem.go
+++ b/pkg/sentry/usermem/usermem.go
@@ -16,6 +16,7 @@
 package usermem
 
 import (
+	"bytes"
 	"errors"
 	"io"
 	"strconv"
@@ -270,11 +271,10 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts)
 		// Look for the terminating zero byte, which may have occurred before
 		// hitting err.
-		for i, c := range buf[done : done+n] {
-			if c == 0 {
-				return stringFromImmutableBytes(buf[:done+i]), nil
-			}
+		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
+			return stringFromImmutableBytes(buf[:done+i]), nil
 		}
+
 		done += n
 		if err != nil {
 			return stringFromImmutableBytes(buf[:done]), err
-- 
cgit v1.2.3


From 0a8a75f3dabaf5c097710c3bb961b67b8ed653b5 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 19 Sep 2019 11:35:27 -0700
Subject: Job control: controlling TTYs and foreground process groups.

Adresses a deadlock with the rolled back change:
https://github.com/google/gvisor/commit/b6a5b950d28e0b474fdad160b88bc15314cf9259
Creating a session from an orphaned process group was causing a lock to be
acquired twice by a single goroutine. This behavior is addressed, and a test
(OrphanRegression) has been added to pty.cc.

Implemented the following ioctls:
- TIOCSCTTY - set controlling TTY
- TIOCNOTTY - remove controlling tty, maybe signal some other processes
- TIOCGPGRP - get foreground process group. Also enables tcgetpgrp().
- TIOCSPGRP - set foreground process group. Also enabled tcsetpgrp().

Next steps are to actually turn terminal-generated control characters (e.g. C^c)
into signals to the proper process groups, and to send SIGTTOU and SIGTTIN when
appropriate.

PiperOrigin-RevId: 270088599
---
 pkg/sentry/fs/tty/BUILD           |   1 +
 pkg/sentry/fs/tty/dir.go          |   3 +
 pkg/sentry/fs/tty/master.go       |  17 +-
 pkg/sentry/fs/tty/slave.go        |  13 +-
 pkg/sentry/fs/tty/terminal.go     |  92 ++++++++-
 pkg/sentry/kernel/BUILD           |   1 +
 pkg/sentry/kernel/sessions.go     |  20 +-
 pkg/sentry/kernel/task_start.go   |   3 +-
 pkg/sentry/kernel/thread_group.go | 179 +++++++++++++++++
 pkg/sentry/kernel/tty.go          |  28 +++
 test/syscalls/BUILD               |   4 +
 test/syscalls/linux/BUILD         |  19 ++
 test/syscalls/linux/pty.cc        | 393 ++++++++++++++++++++++++++++++++++++--
 test/syscalls/linux/pty_root.cc   |  68 +++++++
 test/util/BUILD                   |  11 ++
 test/util/pty_util.cc             |  45 +++++
 test/util/pty_util.h              |  30 +++
 17 files changed, 892 insertions(+), 35 deletions(-)
 create mode 100644 pkg/sentry/kernel/tty.go
 create mode 100644 test/syscalls/linux/pty_root.cc
 create mode 100644 test/util/pty_util.cc
 create mode 100644 test/util/pty_util.h

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index d799de748..25811f668 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -25,6 +25,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 1d128532b..2f639c823 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -129,6 +129,9 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
 
 // Release implements fs.InodeOperations.Release.
 func (d *dirInodeOperations) Release(ctx context.Context) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
 	d.master.DecRef()
 	if len(d.slaves) != 0 {
 		panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 92ec1ca18..19b7557d5 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -172,6 +172,19 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io userme
 		return 0, mf.t.ld.windowSize(ctx, io, args)
 	case linux.TIOCSWINSZ:
 		return 0, mf.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
@@ -185,8 +198,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		linux.TCSETS,
 		linux.TCSETSW,
 		linux.TCSETSF,
-		linux.TIOCGPGRP,
-		linux.TIOCSPGRP,
 		linux.TIOCGWINSZ,
 		linux.TIOCSWINSZ,
 		linux.TIOCSETD,
@@ -200,8 +211,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		linux.TIOCEXCL,
 		linux.TIOCNXCL,
 		linux.TIOCGEXCL,
-		linux.TIOCNOTTY,
-		linux.TIOCSCTTY,
 		linux.TIOCGSID,
 		linux.TIOCGETD,
 		linux.TIOCVHANGUP,
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index e30266404..944c4ada1 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -152,9 +152,16 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		// TODO(b/129283598): Implement once we have support for job
-		// control.
-		return 0, nil
+		return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index b7cecb2ed..ff8138820 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,7 +17,10 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
 // Terminal is a pseudoterminal.
@@ -26,23 +29,100 @@ import (
 type Terminal struct {
 	refs.AtomicRefCount
 
-	// n is the terminal index.
+	// n is the terminal index. It is immutable.
 	n uint32
 
-	// d is the containing directory.
+	// d is the containing directory. It is immutable.
 	d *dirInodeOperations
 
-	// ld is the line discipline of the terminal.
+	// ld is the line discipline of the terminal. It is immutable.
 	ld *lineDiscipline
+
+	// masterKTTY contains the controlling process of the master end of
+	// this terminal. This field is immutable.
+	masterKTTY *kernel.TTY
+
+	// slaveKTTY contains the controlling process of the slave end of this
+	// terminal. This field is immutable.
+	slaveKTTY *kernel.TTY
 }
 
 func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
 	termios := linux.DefaultSlaveTermios
 	t := Terminal{
-		d:  d,
-		n:  n,
-		ld: newLineDiscipline(termios),
+		d:          d,
+		n:          n,
+		ld:         newLineDiscipline(termios),
+		masterKTTY: &kernel.TTY{},
+		slaveKTTY:  &kernel.TTY{},
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
 }
+
+// setControllingTTY makes tm the controlling terminal of the calling thread
+// group.
+func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int())
+}
+
+// releaseControllingTTY removes tm as the controlling terminal of the calling
+// thread group.
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("releaseControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
+}
+
+// foregroundProcessGroup gets the process group ID of tm's foreground process.
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("foregroundProcessGroup must be called from a task context")
+	}
+
+	ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
+	if err != nil {
+		return 0, err
+	}
+
+	// Write it out to *arg.
+	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// foregroundProcessGroup sets tm's foreground process.
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setForegroundProcessGroup must be called from a task context")
+	}
+
+	// Read in the process group ID.
+	var pgid int32
+	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		return 0, err
+	}
+
+	ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
+	return uintptr(ret), err
+}
+
+func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
+	if isMaster {
+		return tm.masterKTTY
+	}
+	return tm.slaveKTTY
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e964a991b..eaccfd02d 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -145,6 +145,7 @@ go_library(
         "threads.go",
         "timekeeper.go",
         "timekeeper_state.go",
+        "tty.go",
         "uts_namespace.go",
         "vdso.go",
         "version.go",
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 81fcd8258..047b5214d 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -47,6 +47,11 @@ type Session struct {
 	// The id is immutable.
 	id SessionID
 
+	// foreground is the foreground process group.
+	//
+	// This is protected by TaskSet.mu.
+	foreground *ProcessGroup
+
 	// ProcessGroups is a list of process groups in this Session. This is
 	// protected by TaskSet.mu.
 	processGroups processGroupList
@@ -260,12 +265,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
 func (tg *ThreadGroup) CreateSession() error {
 	tg.pidns.owner.mu.Lock()
 	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
 	return tg.createSession()
 }
 
 // createSession creates a new session for a threadgroup.
 //
-// Precondition: callers must hold TaskSet.mu for writing.
+// Precondition: callers must hold TaskSet.mu and the signal mutex for writing.
 func (tg *ThreadGroup) createSession() error {
 	// Get the ID for this thread in the current namespace.
 	id := tg.pidns.tgids[tg]
@@ -321,8 +328,14 @@ func (tg *ThreadGroup) createSession() error {
 			childTG.processGroup.incRefWithParent(pg)
 			childTG.processGroup.decRefWithParent(oldParentPG)
 		})
-		tg.processGroup.decRefWithParent(oldParentPG)
+		// If tg.processGroup is an orphan, decRefWithParent will lock
+		// the signal mutex of each thread group in tg.processGroup.
+		// However, tg's signal mutex may already be locked at this
+		// point. We change tg's process group before calling
+		// decRefWithParent to avoid locking tg's signal mutex twice.
+		oldPG := tg.processGroup
 		tg.processGroup = pg
+		oldPG.decRefWithParent(oldParentPG)
 	} else {
 		// The current process group may be nil only in the case of an
 		// unparented thread group (i.e. the init process). This would
@@ -346,6 +359,9 @@ func (tg *ThreadGroup) createSession() error {
 		ns.processGroups[ProcessGroupID(local)] = pg
 	}
 
+	// Disconnect from the controlling terminal.
+	tg.tty = nil
+
 	return nil
 }
 
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index d60cd62c7..ae6fc4025 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -172,9 +172,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		if parentPG := tg.parentPG(); parentPG == nil {
 			tg.createSession()
 		} else {
-			// Inherit the process group.
+			// Inherit the process group and terminal.
 			parentPG.incRefWithParent(parentPG)
 			tg.processGroup = parentPG
+			tg.tty = t.parent.tg.tty
 		}
 	}
 	tg.tasks.PushBack(t)
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 2a97e3e8e..0eef24bfb 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,10 +19,13 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // A ThreadGroup is a logical grouping of tasks that has widespread
@@ -245,6 +248,12 @@ type ThreadGroup struct {
 	//
 	// mounts is immutable.
 	mounts *fs.MountNamespace
+
+	// tty is the thread group's controlling terminal. If nil, there is no
+	// controlling terminal.
+	//
+	// tty is protected by the signal mutex.
+	tty *TTY
 }
 
 // newThreadGroup returns a new, empty thread group in PID namespace ns. The
@@ -324,6 +333,176 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
 	}
 }
 
+// SetControllingTTY sets tty as the controlling terminal of tg.
+func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	// We might be asked to set the controlling terminal of multiple
+	// processes, so we lock both the TaskSet and SignalHandlers.
+	tg.pidns.owner.mu.Lock()
+	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// "The calling process must be a session leader and not have a
+	// controlling terminal already." - tty_ioctl(4)
+	if tg.processGroup.session.leader != tg || tg.tty != nil {
+		return syserror.EINVAL
+	}
+
+	// "If this terminal is already the controlling terminal of a different
+	// session group, then the ioctl fails with EPERM, unless the caller
+	// has the CAP_SYS_ADMIN capability and arg equals 1, in which case the
+	// terminal is stolen, and all processes that had it as controlling
+	// terminal lose it." - tty_ioctl(4)
+	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
+		if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+			return syserror.EPERM
+		}
+		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
+		for othertg := range tg.pidns.owner.Root.tgids {
+			// This won't deadlock by locking tg.signalHandlers
+			// because at this point:
+			// - We only lock signalHandlers if it's in the same
+			//   session as the tty's controlling thread group.
+			// - We know that the calling thread group is not in
+			//   the same session as the tty's controlling thread
+			//   group.
+			if othertg.processGroup.session == tty.tg.processGroup.session {
+				othertg.signalHandlers.mu.Lock()
+				othertg.tty = nil
+				othertg.signalHandlers.mu.Unlock()
+			}
+		}
+	}
+
+	// Set the controlling terminal and foreground process group.
+	tg.tty = tty
+	tg.processGroup.session.foreground = tg.processGroup
+	// Set this as the controlling process of the terminal.
+	tty.tg = tg
+
+	return nil
+}
+
+// ReleaseControllingTTY gives up tty as the controlling tty of tg.
+func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	// We might be asked to set the controlling terminal of multiple
+	// processes, so we lock both the TaskSet and SignalHandlers.
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+
+	// Just below, we may re-lock signalHandlers in order to send signals.
+	// Thus we can't defer Unlock here.
+	tg.signalHandlers.mu.Lock()
+
+	if tg.tty == nil || tg.tty != tty {
+		tg.signalHandlers.mu.Unlock()
+		return syserror.ENOTTY
+	}
+
+	// "If the process was session leader, then send SIGHUP and SIGCONT to
+	// the foreground process group and all processes in the current
+	// session lose their controlling terminal." - tty_ioctl(4)
+	// Remove tty as the controlling tty for each process in the session,
+	// then send them SIGHUP and SIGCONT.
+
+	// If we're not the session leader, we don't have to do much.
+	if tty.tg != tg {
+		tg.tty = nil
+		tg.signalHandlers.mu.Unlock()
+		return nil
+	}
+
+	tg.signalHandlers.mu.Unlock()
+
+	// We're the session leader. SIGHUP and SIGCONT the foreground process
+	// group and remove all controlling terminals in the session.
+	var lastErr error
+	for othertg := range tg.pidns.owner.Root.tgids {
+		if othertg.processGroup.session == tg.processGroup.session {
+			othertg.signalHandlers.mu.Lock()
+			othertg.tty = nil
+			if othertg.processGroup == tg.processGroup.session.foreground {
+				if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil {
+					lastErr = err
+				}
+				if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil {
+					lastErr = err
+				}
+			}
+			othertg.signalHandlers.mu.Unlock()
+		}
+	}
+
+	return lastErr
+}
+
+// ForegroundProcessGroup returns the process group ID of the foreground
+// process group.
+func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	tg.pidns.owner.mu.Lock()
+	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// "When fd does not refer to the controlling terminal of the calling
+	// process, -1 is returned" - tcgetpgrp(3)
+	if tg.tty != tty {
+		return -1, syserror.ENOTTY
+	}
+
+	return int32(tg.processGroup.session.foreground.id), nil
+}
+
+// SetForegroundProcessGroup sets the foreground process group of tty to pgid.
+func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) {
+	tty.mu.Lock()
+	defer tty.mu.Unlock()
+
+	tg.pidns.owner.mu.Lock()
+	defer tg.pidns.owner.mu.Unlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// TODO(b/129283598): "If tcsetpgrp() is called by a member of a
+	// background process group in its session, and the calling process is
+	// not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all
+	// members of this background process group."
+
+	// tty must be the controlling terminal.
+	if tg.tty != tty {
+		return -1, syserror.ENOTTY
+	}
+
+	// pgid must be positive.
+	if pgid < 0 {
+		return -1, syserror.EINVAL
+	}
+
+	// pg must not be empty. Empty process groups are removed from their
+	// pid namespaces.
+	pg, ok := tg.pidns.processGroups[pgid]
+	if !ok {
+		return -1, syserror.ESRCH
+	}
+
+	// pg must be part of this process's session.
+	if tg.processGroup.session != pg.session {
+		return -1, syserror.EPERM
+	}
+
+	tg.processGroup.session.foreground.id = pgid
+	return 0, nil
+}
+
 // itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
 //
 // +stateify savable
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
new file mode 100644
index 000000000..34f84487a
--- /dev/null
+++ b/pkg/sentry/kernel/tty.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync"
+
+// TTY defines the relationship between a thread group and its controlling
+// terminal.
+//
+// +stateify savable
+type TTY struct {
+	mu sync.Mutex `state:"nosave"`
+
+	// tg is protected by mu.
+	tg *ThreadGroup
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 0135435ea..63e4c63dd 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -320,6 +320,10 @@ syscall_test(
     test = "//test/syscalls/linux:pty_test",
 )
 
+syscall_test(
+    test = "//test/syscalls/linux:pty_root_test",
+)
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:pwritev2_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 56fe7be37..a4cebf46f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1287,8 +1287,10 @@ cc_binary(
     srcs = ["pty.cc"],
     linkstatic = 1,
     deps = [
+        "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
+        "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
@@ -1300,6 +1302,23 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "pty_root_test",
+    testonly = 1,
+    srcs = ["pty_root.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:posix_error",
+        "//test/util:pty_util",
+        "//test/util:test_main",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "partial_bad_buffer_test",
     testonly = 1,
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index d1ab4703f..286388316 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -13,13 +13,17 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/capability.h>
 #include <linux/major.h>
 #include <poll.h>
+#include <sched.h>
+#include <signal.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <termios.h>
 #include <unistd.h>
 
@@ -31,8 +35,10 @@
 #include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
+#include "test/util/pty_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
@@ -370,25 +376,6 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
   return PosixError(ETIMEDOUT, "Poll timed out");
 }
 
-// Opens the slave end of the passed master as R/W and nonblocking.
-PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
-  // Get pty index.
-  int n;
-  int ret = ioctl(master.get(), TIOCGPTN, &n);
-  if (ret < 0) {
-    return PosixError(errno, "ioctl(TIOCGPTN) failed");
-  }
-
-  // Unlock pts.
-  int unlock = 0;
-  ret = ioctl(master.get(), TIOCSPTLCK, &unlock);
-  if (ret < 0) {
-    return PosixError(errno, "ioctl(TIOSPTLCK) failed");
-  }
-
-  return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK);
-}
-
 TEST(BasicPtyTest, StatUnopenedMaster) {
   struct stat s;
   ASSERT_THAT(stat("/dev/ptmx", &s), SyscallSucceeds());
@@ -1233,6 +1220,374 @@ TEST_F(PtyTest, SetMasterWindowSize) {
   EXPECT_EQ(retrieved_ws.ws_col, kCols);
 }
 
+class JobControlTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+    slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_));
+
+    // Make this a session leader, which also drops the controlling terminal.
+    // In the gVisor test environment, this test will be run as the session
+    // leader already (as the sentry init process).
+    if (!IsRunningOnGvisor()) {
+      ASSERT_THAT(setsid(), SyscallSucceeds());
+    }
+  }
+
+  // Master and slave ends of the PTY. Non-blocking.
+  FileDescriptor master_;
+  FileDescriptor slave_;
+};
+
+TEST_F(JobControlTest, SetTTYMaster) {
+  ASSERT_THAT(ioctl(master_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetTTY) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetTTYNonLeader) {
+  // Fork a process that won't be the session leader.
+  pid_t child = fork();
+  if (!child) {
+    // We shouldn't be able to set the terminal.
+    TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 0));
+    _exit(0);
+  }
+
+  int wstatus;
+  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  ASSERT_EQ(wstatus, 0);
+}
+
+TEST_F(JobControlTest, SetTTYBadArg) {
+  // Despite the man page saying arg should be 0 here, Linux doesn't actually
+  // check.
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 1), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetTTYDifferentSession) {
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  // Fork, join a new session, and try to steal the parent's controlling
+  // terminal, which should fail.
+  pid_t child = fork();
+  if (!child) {
+    TEST_PCHECK(setsid() >= 0);
+    // We shouldn't be able to steal the terminal.
+    TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 1));
+    _exit(0);
+  }
+
+  int wstatus;
+  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  ASSERT_EQ(wstatus, 0);
+}
+
+TEST_F(JobControlTest, ReleaseTTY) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  // Make sure we're ignoring SIGHUP, which will be sent to this process once we
+  // disconnect they TTY.
+  struct sigaction sa = {
+      .sa_handler = SIG_IGN,
+      .sa_flags = 0,
+  };
+  sigemptyset(&sa.sa_mask);
+  struct sigaction old_sa;
+  EXPECT_THAT(sigaction(SIGHUP, &sa, &old_sa), SyscallSucceeds());
+  EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
+  EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, ReleaseUnsetTTY) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(JobControlTest, ReleaseWrongTTY) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  ASSERT_THAT(ioctl(master_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(JobControlTest, ReleaseTTYNonLeader) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  pid_t child = fork();
+  if (!child) {
+    TEST_PCHECK(!ioctl(slave_.get(), TIOCNOTTY));
+    _exit(0);
+  }
+
+  int wstatus;
+  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  ASSERT_EQ(wstatus, 0);
+}
+
+TEST_F(JobControlTest, ReleaseTTYDifferentSession) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  pid_t child = fork();
+  if (!child) {
+    // Join a new session, then try to disconnect.
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(ioctl(slave_.get(), TIOCNOTTY));
+    _exit(0);
+  }
+
+  int wstatus;
+  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  ASSERT_EQ(wstatus, 0);
+}
+
+// Used by the child process spawned in ReleaseTTYSignals to track received
+// signals.
+static int received;
+
+void sig_handler(int signum) { received |= signum; }
+
+// When the session leader releases its controlling terminal, the foreground
+// process group gets SIGHUP, then SIGCONT. This test:
+// - Spawns 2 threads
+// - Has thread 1 return 0 if it gets both SIGHUP and SIGCONT
+// - Has thread 2 leave the foreground process group, and return non-zero if it
+//   receives any signals.
+// - Has the parent thread release its controlling terminal
+// - Checks that thread 1 got both signals
+// - Checks that thread 2 didn't get any signals.
+TEST_F(JobControlTest, ReleaseTTYSignals) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  received = 0;
+  struct sigaction sa = {
+      .sa_handler = sig_handler,
+      .sa_flags = 0,
+  };
+  sigemptyset(&sa.sa_mask);
+  sigaddset(&sa.sa_mask, SIGHUP);
+  sigaddset(&sa.sa_mask, SIGCONT);
+  sigprocmask(SIG_BLOCK, &sa.sa_mask, NULL);
+
+  pid_t same_pgrp_child = fork();
+  if (!same_pgrp_child) {
+    // The child will wait for SIGHUP and SIGCONT, then return 0. It begins with
+    // SIGHUP and SIGCONT blocked. We install signal handlers for those signals,
+    // then use sigsuspend to wait for those specific signals.
+    TEST_PCHECK(!sigaction(SIGHUP, &sa, NULL));
+    TEST_PCHECK(!sigaction(SIGCONT, &sa, NULL));
+    sigset_t mask;
+    sigfillset(&mask);
+    sigdelset(&mask, SIGHUP);
+    sigdelset(&mask, SIGCONT);
+    while (received != (SIGHUP | SIGCONT)) {
+      sigsuspend(&mask);
+    }
+    _exit(0);
+  }
+
+  // We don't want to block these anymore.
+  sigprocmask(SIG_UNBLOCK, &sa.sa_mask, NULL);
+
+  // This child will return non-zero if either SIGHUP or SIGCONT are received.
+  pid_t diff_pgrp_child = fork();
+  if (!diff_pgrp_child) {
+    TEST_PCHECK(!setpgid(0, 0));
+    TEST_PCHECK(pause());
+    _exit(1);
+  }
+
+  EXPECT_THAT(setpgid(diff_pgrp_child, diff_pgrp_child), SyscallSucceeds());
+
+  // Make sure we're ignoring SIGHUP, which will be sent to this process once we
+  // disconnect they TTY.
+  struct sigaction sighup_sa = {
+      .sa_handler = SIG_IGN,
+      .sa_flags = 0,
+  };
+  sigemptyset(&sighup_sa.sa_mask);
+  struct sigaction old_sa;
+  EXPECT_THAT(sigaction(SIGHUP, &sighup_sa, &old_sa), SyscallSucceeds());
+
+  // Release the controlling terminal, sending SIGHUP and SIGCONT to all other
+  // processes in this process group.
+  EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
+
+  EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
+
+  // The child in the same process group will get signaled.
+  int wstatus;
+  EXPECT_THAT(waitpid(same_pgrp_child, &wstatus, 0),
+              SyscallSucceedsWithValue(same_pgrp_child));
+  EXPECT_EQ(wstatus, 0);
+
+  // The other child will not get signaled.
+  EXPECT_THAT(waitpid(diff_pgrp_child, &wstatus, WNOHANG),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(kill(diff_pgrp_child, SIGKILL), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, GetForegroundProcessGroup) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  pid_t foreground_pgid;
+  pid_t pid;
+  ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
+              SyscallSucceeds());
+  ASSERT_THAT(pid = getpid(), SyscallSucceeds());
+
+  ASSERT_EQ(foreground_pgid, pid);
+}
+
+TEST_F(JobControlTest, GetForegroundProcessGroupNonControlling) {
+  // At this point there's no controlling terminal, so TIOCGPGRP should fail.
+  pid_t foreground_pgid;
+  ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
+              SyscallFailsWithErrno(ENOTTY));
+}
+
+// This test:
+// - sets itself as the foreground process group
+// - creates a child process in a new process group
+// - sets that child as the foreground process group
+// - kills its child and sets itself as the foreground process group.
+TEST_F(JobControlTest, SetForegroundProcessGroup) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  // Ignore SIGTTOU so that we don't stop ourself when calling tcsetpgrp.
+  struct sigaction sa = {
+      .sa_handler = SIG_IGN,
+      .sa_flags = 0,
+  };
+  sigemptyset(&sa.sa_mask);
+  sigaction(SIGTTOU, &sa, NULL);
+
+  // Set ourself as the foreground process group.
+  ASSERT_THAT(tcsetpgrp(slave_.get(), getpgid(0)), SyscallSucceeds());
+
+  // Create a new process that just waits to be signaled.
+  pid_t child = fork();
+  if (!child) {
+    TEST_PCHECK(!pause());
+    // We should never reach this.
+    _exit(1);
+  }
+
+  // Make the child its own process group, then make it the controlling process
+  // group of the terminal.
+  ASSERT_THAT(setpgid(child, child), SyscallSucceeds());
+  ASSERT_THAT(tcsetpgrp(slave_.get(), child), SyscallSucceeds());
+
+  // Sanity check - we're still the controlling session.
+  ASSERT_EQ(getsid(0), getsid(child));
+
+  // Signal the child, wait for it to exit, then retake the terminal.
+  ASSERT_THAT(kill(child, SIGTERM), SyscallSucceeds());
+  int wstatus;
+  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  ASSERT_TRUE(WIFSIGNALED(wstatus));
+  ASSERT_EQ(WTERMSIG(wstatus), SIGTERM);
+
+  // Set ourself as the foreground process.
+  pid_t pgid;
+  ASSERT_THAT(pgid = getpgid(0), SyscallSucceeds());
+  ASSERT_THAT(tcsetpgrp(slave_.get(), pgid), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupWrongTTY) {
+  pid_t pid = getpid();
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
+              SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupNegPgid) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  pid_t pid = -1;
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupEmptyProcessGroup) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  // Create a new process, put it in a new process group, make that group the
+  // foreground process group, then have the process wait.
+  pid_t child = fork();
+  if (!child) {
+    TEST_PCHECK(!setpgid(0, 0));
+    _exit(0);
+  }
+
+  // Wait for the child to exit.
+  int wstatus;
+  EXPECT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  // The child's process group doesn't exist anymore - this should fail.
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupDifferentSession) {
+  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  // Create a new process and put it in a new session.
+  pid_t child = fork();
+  if (!child) {
+    TEST_PCHECK(setsid() >= 0);
+    // Tell the parent we're in a new session.
+    TEST_PCHECK(!raise(SIGSTOP));
+    TEST_PCHECK(!pause());
+    _exit(1);
+  }
+
+  // Wait for the child to tell us it's in a new session.
+  int wstatus;
+  EXPECT_THAT(waitpid(child, &wstatus, WUNTRACED),
+              SyscallSucceedsWithValue(child));
+  EXPECT_TRUE(WSTOPSIG(wstatus));
+
+  // Child is in a new session, so we can't make it the foregroup process group.
+  EXPECT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
+              SyscallFailsWithErrno(EPERM));
+
+  EXPECT_THAT(kill(child, SIGKILL), SyscallSucceeds());
+}
+
+// Verify that we don't hang when creating a new session from an orphaned
+// process group (b/139968068). Calling setsid() creates an orphaned process
+// group, as process groups that contain the session's leading process are
+// orphans.
+//
+// We create 2 sessions in this test. The init process in gVisor is considered
+// not to be an orphan (see sessions.go), so we have to create a session from
+// which to create a session. The latter session is being created from an
+// orphaned process group.
+TEST_F(JobControlTest, OrphanRegression) {
+  pid_t session_2_leader = fork();
+  if (!session_2_leader) {
+    TEST_PCHECK(setsid() >= 0);
+
+    pid_t session_3_leader = fork();
+    if (!session_3_leader) {
+      TEST_PCHECK(setsid() >= 0);
+
+      _exit(0);
+    }
+
+    int wstatus;
+    TEST_PCHECK(waitpid(session_3_leader, &wstatus, 0) == session_3_leader);
+    TEST_PCHECK(wstatus == 0);
+
+    _exit(0);
+  }
+
+  int wstatus;
+  ASSERT_THAT(waitpid(session_2_leader, &wstatus, 0),
+              SyscallSucceedsWithValue(session_2_leader));
+  ASSERT_EQ(wstatus, 0);
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/pty_root.cc b/test/syscalls/linux/pty_root.cc
new file mode 100644
index 000000000..14a4af980
--- /dev/null
+++ b/test/syscalls/linux/pty_root.cc
@@ -0,0 +1,68 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/ioctl.h>
+#include <termios.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/pty_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// These tests should be run as root.
+namespace {
+
+TEST(JobControlRootTest, StealTTY) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  // Make this a session leader, which also drops the controlling terminal.
+  // In the gVisor test environment, this test will be run as the session
+  // leader already (as the sentry init process).
+  if (!IsRunningOnGvisor()) {
+    ASSERT_THAT(setsid(), SyscallSucceeds());
+  }
+
+  FileDescriptor master =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+  FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+
+  // Make slave the controlling terminal.
+  ASSERT_THAT(ioctl(slave.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+  // Fork, join a new session, and try to steal the parent's controlling
+  // terminal, which should succeed when we have CAP_SYS_ADMIN and pass an arg
+  // of 1.
+  pid_t child = fork();
+  if (!child) {
+    ASSERT_THAT(setsid(), SyscallSucceeds());
+    // We shouldn't be able to steal the terminal with the wrong arg value.
+    TEST_PCHECK(ioctl(slave.get(), TIOCSCTTY, 0));
+    // We should be able to steal it here.
+    TEST_PCHECK(!ioctl(slave.get(), TIOCSCTTY, 1));
+    _exit(0);
+  }
+
+  int wstatus;
+  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+  ASSERT_EQ(wstatus, 0);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/BUILD b/test/util/BUILD
index 52f8b9e1f..25ed9c944 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -190,6 +190,17 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "pty_util",
+    testonly = 1,
+    srcs = ["pty_util.cc"],
+    hdrs = ["pty_util.h"],
+    deps = [
+        ":file_descriptor",
+        ":posix_error",
+    ],
+)
+
 cc_library(
     name = "signal_util",
     testonly = 1,
diff --git a/test/util/pty_util.cc b/test/util/pty_util.cc
new file mode 100644
index 000000000..c0fd9a095
--- /dev/null
+++ b/test/util/pty_util.cc
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/util/pty_util.h"
+
+#include <sys/ioctl.h>
+#include <termios.h>
+
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
+  // Get pty index.
+  int n;
+  int ret = ioctl(master.get(), TIOCGPTN, &n);
+  if (ret < 0) {
+    return PosixError(errno, "ioctl(TIOCGPTN) failed");
+  }
+
+  // Unlock pts.
+  int unlock = 0;
+  ret = ioctl(master.get(), TIOCSPTLCK, &unlock);
+  if (ret < 0) {
+    return PosixError(errno, "ioctl(TIOSPTLCK) failed");
+  }
+
+  return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/pty_util.h b/test/util/pty_util.h
new file mode 100644
index 000000000..367b14f15
--- /dev/null
+++ b/test/util/pty_util.h
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_PTY_UTIL_H_
+#define GVISOR_TEST_UTIL_PTY_UTIL_H_
+
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+// Opens the slave end of the passed master as R/W and nonblocking.
+PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master);
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_UTIL_PTY_UTIL_H_
-- 
cgit v1.2.3


From 75781ab3efa7b377c6dc4cf26840323f504d5eb5 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 19 Sep 2019 13:38:14 -0700
Subject: Remove defer from hot path and ensure Atomic is applied consistently.

PiperOrigin-RevId: 270114317
---
 pkg/sentry/socket/epsocket/epsocket.go | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 3e05e40fe..25adca090 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -415,13 +415,13 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 // WriteTo implements fs.FileOperations.WriteTo.
 func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) {
 	s.readMu.Lock()
-	defer s.readMu.Unlock()
 
 	// Copy as much data as possible.
 	done := int64(0)
 	for count > 0 {
 		// This may return a blocking error.
 		if err := s.fetchReadView(); err != nil {
+			s.readMu.Unlock()
 			return done, err.ToError()
 		}
 
@@ -434,16 +434,18 @@ func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Write
 			// supported by any Linux system calls, but the
 			// expectation is that now a caller will call read to
 			// actually remove these bytes from the socket.
-			return done, nil
+			break
 		}
 
 		// Drop that part of the view.
 		s.readView.TrimFront(n)
 		if err != nil {
+			s.readMu.Unlock()
 			return done, err
 		}
 	}
 
+	s.readMu.Unlock()
 	return done, nil
 }
 
@@ -549,7 +551,11 @@ func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) {
 // ReadFrom implements fs.FileOperations.ReadFrom.
 func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
 	f := &readerPayload{ctx: ctx, r: r, count: count}
-	n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{})
+	n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{
+		// Reads may be destructive but should be very fast,
+		// so we can't release the lock while copying data.
+		Atomic: true,
+	})
 	if err == tcpip.ErrWouldBlock {
 		return 0, syserror.ErrWouldBlock
 	}
@@ -561,9 +567,7 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader
 		}
 
 		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{
-			// Reads may be destructive but should be very fast,
-			// so we can't release the lock while copying data.
-			Atomic: true,
+			Atomic: true, // See above.
 		})
 	}
 	if err == tcpip.ErrWouldBlock {
-- 
cgit v1.2.3


From 223481e92743f305ada22689f4ba41b36119d5fc Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Fri, 20 Sep 2019 17:49:18 +0000
Subject: fix set hostname

Previously, when we set hostname:

$ strace hostname abc
...
sethostname("abc", 3) = -1 ENAMETOOLONG (File name too long)
...

According to man 2 sethostname:

"The len argument specifies the number of bytes in name. (Thus, name
does not require a terminating null byte.)"

We wrongly use the CopyStringIn() to check terminating zero byte in
the implementation of sethostname syscall.

To fix this, we use CopyInBytes() instead.

Fixes: #861

Reported-by: chenglang.hy <chenglang.hy@antfin.com>
Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
---
 pkg/sentry/syscalls/linux/sys_utsname.go |  6 +++---
 test/syscalls/linux/uname.cc             | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index 271ace08e..748e8dd8d 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -79,11 +79,11 @@ func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		return 0, nil, syserror.EINVAL
 	}
 
-	name, err := t.CopyInString(nameAddr, int(size))
-	if err != nil {
+	name := make([]byte, size)
+	if _, err := t.CopyInBytes(nameAddr, name); err != nil {
 		return 0, nil, err
 	}
 
-	utsns.SetHostName(name)
+	utsns.SetHostName(string(name))
 	return 0, nil, nil
 }
diff --git a/test/syscalls/linux/uname.cc b/test/syscalls/linux/uname.cc
index 0a5d91017..d8824b171 100644
--- a/test/syscalls/linux/uname.cc
+++ b/test/syscalls/linux/uname.cc
@@ -41,6 +41,19 @@ TEST(UnameTest, Sanity) {
 TEST(UnameTest, SetNames) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
+  char hostname[65];
+  ASSERT_THAT(sethostname("0123456789", 3), SyscallSucceeds());
+  EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+  EXPECT_EQ(absl::string_view(hostname), "012");
+
+  ASSERT_THAT(sethostname("0123456789\0xxx", 11), SyscallSucceeds());
+  EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+  EXPECT_EQ(absl::string_view(hostname), "0123456789");
+
+  ASSERT_THAT(sethostname("0123456789\0xxx", 12), SyscallSucceeds());
+  EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+  EXPECT_EQ(absl::string_view(hostname), "0123456789");
+
   constexpr char kHostname[] = "wubbalubba";
   ASSERT_THAT(sethostname(kHostname, sizeof(kHostname)), SyscallSucceeds());
 
@@ -54,7 +67,6 @@ TEST(UnameTest, SetNames) {
   EXPECT_EQ(absl::string_view(buf.domainname), kDomainname);
 
   // These should just be glibc wrappers that also call uname(2).
-  char hostname[65];
   EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
   EXPECT_EQ(absl::string_view(hostname), kHostname);
 
-- 
cgit v1.2.3


From fb55c2bd0d5d80b240184f643967f214d3dbc259 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 20 Sep 2019 14:23:20 -0700
Subject: Change vfs.Dirent.Off to NextOff.

"d_off is the distance from the start of the directory to the start of the next
linux_dirent." - getdents(2).

PiperOrigin-RevId: 270349685
---
 pkg/sentry/fsimpl/ext/directory.go   |  8 ++++----
 pkg/sentry/fsimpl/ext/ext_test.go    |  2 +-
 pkg/sentry/fsimpl/memfs/directory.go | 24 ++++++++++++------------
 pkg/sentry/vfs/file_description.go   |  7 +++++--
 4 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index b51f3e18d..0b471d121 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -190,10 +190,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 			}
 
 			if !cb.Handle(vfs.Dirent{
-				Name: child.diskDirent.FileName(),
-				Type: fs.ToDirentType(childType),
-				Ino:  uint64(child.diskDirent.Inode()),
-				Off:  fd.off,
+				Name:    child.diskDirent.FileName(),
+				Type:    fs.ToDirentType(childType),
+				Ino:     uint64(child.diskDirent.Inode()),
+				NextOff: fd.off + 1,
 			}) {
 				dir.childList.InsertBefore(child, fd.iter)
 				return nil
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 63cf7aeaf..1aa2bd6a4 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -584,7 +584,7 @@ func TestIterDirents(t *testing.T) {
 			// Ignore the inode number and offset of dirents because those are likely to
 			// change as the underlying image changes.
 			cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
-				return p.String() == "Ino" || p.String() == "Off"
+				return p.String() == "Ino" || p.String() == "NextOff"
 			}, cmp.Ignore())
 			if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" {
 				t.Errorf("dirents mismatch (-want +got):\n%s", diff)
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
index c52dc781c..c620227c9 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -75,10 +75,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 	if fd.off == 0 {
 		if !cb.Handle(vfs.Dirent{
-			Name: ".",
-			Type: linux.DT_DIR,
-			Ino:  vfsd.Impl().(*dentry).inode.ino,
-			Off:  0,
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     vfsd.Impl().(*dentry).inode.ino,
+			NextOff: 1,
 		}) {
 			return nil
 		}
@@ -87,10 +87,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	if fd.off == 1 {
 		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
 		if !cb.Handle(vfs.Dirent{
-			Name: "..",
-			Type: parentInode.direntType(),
-			Ino:  parentInode.ino,
-			Off:  1,
+			Name:    "..",
+			Type:    parentInode.direntType(),
+			Ino:     parentInode.ino,
+			NextOff: 2,
 		}) {
 			return nil
 		}
@@ -112,10 +112,10 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 		// Skip other directoryFD iterators.
 		if child.inode != nil {
 			if !cb.Handle(vfs.Dirent{
-				Name: child.vfsd.Name(),
-				Type: child.inode.direntType(),
-				Ino:  child.inode.ino,
-				Off:  fd.off,
+				Name:    child.vfsd.Name(),
+				Type:    child.inode.direntType(),
+				Ino:     child.inode.ino,
+				NextOff: fd.off + 1,
 			}) {
 				dir.childList.InsertBefore(child, fd.iter)
 				return nil
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 86bde7fb3..7eb2b2821 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -199,8 +199,11 @@ type Dirent struct {
 	// Ino is the inode number.
 	Ino uint64
 
-	// Off is this Dirent's offset.
-	Off int64
+	// NextOff is the offset of the *next* Dirent in the directory; that is,
+	// FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will
+	// cause the next call to FileDescription.IterDirents() to yield the next
+	// Dirent. (The offset of the first Dirent in a directory is always 0.)
+	NextOff int64
 }
 
 // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
-- 
cgit v1.2.3


From 4aeedd47bfc70c6c1ffd3abcde7227b936962dfd Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 23 Sep 2019 08:23:44 -0700
Subject: internal BUILD file cleanup.

PiperOrigin-RevId: 270680704
---
 pkg/metric/BUILD                 | 7 +++++++
 pkg/sentry/arch/BUILD            | 7 +++++++
 pkg/sentry/kernel/BUILD          | 7 +++++++
 pkg/sentry/kernel/memevent/BUILD | 7 +++++++
 pkg/sentry/socket/rpcinet/BUILD  | 9 +++++++++
 pkg/sentry/strace/BUILD          | 7 +++++++
 pkg/sentry/unimpl/BUILD          | 7 +++++++
 7 files changed, 51 insertions(+)

(limited to 'pkg/sentry')

diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index 842788179..dd6ca6d39 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -1,6 +1,7 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -22,6 +23,12 @@ proto_library(
     visibility = ["//:sandbox"],
 )
 
+cc_proto_library(
+    name = "metric_cc_proto",
+    visibility = ["//:sandbox"],
+    deps = [":metric_proto"],
+)
+
 go_proto_library(
     name = "metric_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/metric/metric_go_proto",
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 7aace2d7b..c71cff9f3 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -1,4 +1,5 @@
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -42,6 +43,12 @@ proto_library(
     visibility = ["//visibility:public"],
 )
 
+cc_proto_library(
+    name = "registers_cc_proto",
+    visibility = ["//visibility:public"],
+    deps = [":registers_proto"],
+)
+
 go_proto_library(
     name = "registers_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto",
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index eaccfd02d..aba2414d4 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -1,5 +1,6 @@
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -84,6 +85,12 @@ proto_library(
     deps = ["//pkg/sentry/arch:registers_proto"],
 )
 
+cc_proto_library(
+    name = "uncaught_signal_cc_proto",
+    visibility = ["//visibility:public"],
+    deps = [":uncaught_signal_proto"],
+)
+
 go_proto_library(
     name = "uncaught_signal_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto",
diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD
index ebcfaa619..d7a7d1169 100644
--- a/pkg/sentry/kernel/memevent/BUILD
+++ b/pkg/sentry/kernel/memevent/BUILD
@@ -1,5 +1,6 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -24,6 +25,12 @@ proto_library(
     visibility = ["//visibility:public"],
 )
 
+cc_proto_library(
+    name = "memory_events_cc_proto",
+    visibility = ["//visibility:public"],
+    deps = [":memory_events_proto"],
+)
+
 go_proto_library(
     name = "memory_events_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto",
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD
index 5061dcbde..3a6baa308 100644
--- a/pkg/sentry/socket/rpcinet/BUILD
+++ b/pkg/sentry/socket/rpcinet/BUILD
@@ -1,5 +1,6 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -49,6 +50,14 @@ proto_library(
     ],
 )
 
+cc_proto_library(
+    name = "syscall_rpc_cc_proto",
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [":syscall_rpc_proto"],
+)
+
 go_proto_library(
     name = "syscall_rpc_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto",
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 445d25010..7d7b42eba 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -1,5 +1,6 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -44,6 +45,12 @@ proto_library(
     visibility = ["//visibility:public"],
 )
 
+cc_proto_library(
+    name = "strace_cc_proto",
+    visibility = ["//visibility:public"],
+    deps = [":strace_proto"],
+)
+
 go_proto_library(
     name = "strace_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto",
diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD
index b69603da3..fc7614fff 100644
--- a/pkg/sentry/unimpl/BUILD
+++ b/pkg/sentry/unimpl/BUILD
@@ -1,5 +1,6 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 
 package(licenses = ["notice"])
 
@@ -10,6 +11,12 @@ proto_library(
     deps = ["//pkg/sentry/arch:registers_proto"],
 )
 
+cc_proto_library(
+    name = "unimplemented_syscall_cc_proto",
+    visibility = ["//visibility:public"],
+    deps = [":unimplemented_syscall_proto"],
+)
+
 go_proto_library(
     name = "unimplemented_syscall_go_proto",
     importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto",
-- 
cgit v1.2.3


From 03ee55cc62c99c5b8f5d6fb00423a66ef44589e3 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 23 Sep 2019 14:37:39 -0700
Subject: netstack: convert more socket options to {Set,Get}SockOptInt

PiperOrigin-RevId: 270763208
---
 pkg/sentry/socket/epsocket/epsocket.go             |  22 ++--
 pkg/sentry/socket/unix/transport/unix.go           |  82 ++++++------
 pkg/tcpip/stack/transport_test.go                  |   5 +
 pkg/tcpip/tcpip.go                                 |  32 +++--
 pkg/tcpip/transport/icmp/endpoint.go               |  29 +++--
 pkg/tcpip/transport/raw/endpoint.go                |  30 +++--
 pkg/tcpip/transport/tcp/endpoint.go                | 144 +++++++++++----------
 pkg/tcpip/transport/tcp/tcp_noracedetector_test.go |  10 +-
 pkg/tcpip/transport/tcp/tcp_test.go                | 121 ++++++++---------
 pkg/tcpip/transport/tcp/testing/context/context.go |   8 +-
 pkg/tcpip/transport/udp/endpoint.go                |  32 +++--
 11 files changed, 276 insertions(+), 239 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 25adca090..3e66f9cbb 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -209,6 +209,10 @@ type commonEndpoint interface {
 	// transport.Endpoint.SetSockOpt.
 	SetSockOpt(interface{}) *tcpip.Error
 
+	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
+	// transport.Endpoint.SetSockOptInt.
+	SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+
 	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
 	// transport.Endpoint.GetSockOpt.
 	GetSockOpt(interface{}) *tcpip.Error
@@ -887,8 +891,8 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var size tcpip.SendBufferSizeOption
-		if err := ep.GetSockOpt(&size); err != nil {
+		size, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -903,8 +907,8 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var size tcpip.ReceiveBufferSizeOption
-		if err := ep.GetSockOpt(&size); err != nil {
+		size, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1275,7 +1279,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.SendBufferSizeOption, int(v)))
 
 	case linux.SO_RCVBUF:
 		if len(optVal) < sizeOfInt32 {
@@ -1283,7 +1287,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, int(v)))
 
 	case linux.SO_REUSEADDR:
 		if len(optVal) < sizeOfInt32 {
@@ -2317,9 +2321,9 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 		return 0, err
 
 	case linux.TIOCOUTQ:
-		var v tcpip.SendQueueSizeOption
-		if err := ep.GetSockOpt(&v); err != nil {
-			return 0, syserr.TranslateNetstackError(err).ToError()
+		v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
+		if terr != nil {
+			return 0, syserr.TranslateNetstackError(terr).ToError()
 		}
 
 		if v > math.MaxInt32 {
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 2b0ad6395..1867b3a5c 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -175,6 +175,10 @@ type Endpoint interface {
 	// types.
 	SetSockOpt(opt interface{}) *tcpip.Error
 
+	// SetSockOptInt sets a socket option for simple cases when a value has
+	// the int type.
+	SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+
 	// GetSockOpt gets a socket option. opt should be a pointer to one of the
 	// tcpip.*Option types.
 	GetSockOpt(opt interface{}) *tcpip.Error
@@ -838,6 +842,10 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
+func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+	return nil
+}
+
 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
@@ -853,65 +861,63 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 			return -1, tcpip.ErrQueueSizeNotSupported
 		}
 		return v, nil
-	default:
-		return -1, tcpip.ErrUnknownProtocolOption
-	}
-}
-
-// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
-	case tcpip.ErrorOption:
-		return nil
 
-	case *tcpip.SendQueueSizeOption:
+	case tcpip.SendQueueSizeOption:
 		e.Lock()
 		if !e.Connected() {
 			e.Unlock()
-			return tcpip.ErrNotConnected
+			return -1, tcpip.ErrNotConnected
 		}
-		qs := tcpip.SendQueueSizeOption(e.connected.SendQueuedSize())
+		v := e.connected.SendQueuedSize()
 		e.Unlock()
-		if qs < 0 {
-			return tcpip.ErrQueueSizeNotSupported
-		}
-		*o = qs
-		return nil
-
-	case *tcpip.PasscredOption:
-		if e.Passcred() {
-			*o = tcpip.PasscredOption(1)
-		} else {
-			*o = tcpip.PasscredOption(0)
+		if v < 0 {
+			return -1, tcpip.ErrQueueSizeNotSupported
 		}
-		return nil
+		return int(v), nil
 
-	case *tcpip.SendBufferSizeOption:
+	case tcpip.SendBufferSizeOption:
 		e.Lock()
 		if !e.Connected() {
 			e.Unlock()
-			return tcpip.ErrNotConnected
+			return -1, tcpip.ErrNotConnected
 		}
-		qs := tcpip.SendBufferSizeOption(e.connected.SendMaxQueueSize())
+		v := e.connected.SendMaxQueueSize()
 		e.Unlock()
-		if qs < 0 {
-			return tcpip.ErrQueueSizeNotSupported
+		if v < 0 {
+			return -1, tcpip.ErrQueueSizeNotSupported
 		}
-		*o = qs
-		return nil
+		return int(v), nil
 
-	case *tcpip.ReceiveBufferSizeOption:
+	case tcpip.ReceiveBufferSizeOption:
 		e.Lock()
 		if e.receiver == nil {
 			e.Unlock()
-			return tcpip.ErrNotConnected
+			return -1, tcpip.ErrNotConnected
 		}
-		qs := tcpip.ReceiveBufferSizeOption(e.receiver.RecvMaxQueueSize())
+		v := e.receiver.RecvMaxQueueSize()
 		e.Unlock()
-		if qs < 0 {
-			return tcpip.ErrQueueSizeNotSupported
+		if v < 0 {
+			return -1, tcpip.ErrQueueSizeNotSupported
+		}
+		return int(v), nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	switch o := opt.(type) {
+	case tcpip.ErrorOption:
+		return nil
+
+	case *tcpip.PasscredOption:
+		if e.Passcred() {
+			*o = tcpip.PasscredOption(1)
+		} else {
+			*o = tcpip.PasscredOption(0)
 		}
-		*o = qs
 		return nil
 
 	case *tcpip.KeepaliveEnabledOption:
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 847d02982..0e69ac7c8 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -91,6 +91,11 @@ func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
+// SetSockOptInt sets a socket option. Currently not supported.
+func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOpt, int) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 	return -1, tcpip.ErrUnknownProtocolOption
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 2534069ab..c021c67ac 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -401,6 +401,10 @@ type Endpoint interface {
 	// SetSockOpt sets a socket option. opt should be one of the *Option types.
 	SetSockOpt(opt interface{}) *Error
 
+	// SetSockOptInt sets a socket option, for simple cases where a value
+	// has the int type.
+	SetSockOptInt(opt SockOpt, v int) *Error
+
 	// GetSockOpt gets a socket option. opt should be a pointer to one of the
 	// *Option types.
 	GetSockOpt(opt interface{}) *Error
@@ -446,10 +450,22 @@ type WriteOptions struct {
 type SockOpt int
 
 const (
-	// ReceiveQueueSizeOption is used in GetSockOpt to specify that the number of
-	// unread bytes in the input buffer should be returned.
+	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
+	// number of unread bytes in the input buffer should be returned.
 	ReceiveQueueSizeOption SockOpt = iota
 
+	// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
+	// specify the send buffer size option.
+	SendBufferSizeOption
+
+	// ReceiveBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
+	// specify the receive buffer size option.
+	ReceiveBufferSizeOption
+
+	// SendQueueSizeOption is used in GetSockOptInt to specify that the
+	// number of unread bytes in the output buffer should be returned.
+	SendQueueSizeOption
+
 	// TODO(b/137664753): convert all int socket options to be handled via
 	// GetSockOptInt.
 )
@@ -458,18 +474,6 @@ const (
 // the endpoint should be cleared and returned.
 type ErrorOption struct{}
 
-// SendBufferSizeOption is used by SetSockOpt/GetSockOpt to specify the send
-// buffer size option.
-type SendBufferSizeOption int
-
-// ReceiveBufferSizeOption is used by SetSockOpt/GetSockOpt to specify the
-// receive buffer size option.
-type ReceiveBufferSizeOption int
-
-// SendQueueSizeOption is used in GetSockOpt to specify that the number of
-// unread bytes in the output buffer should be returned.
-type SendQueueSizeOption int
-
 // V6OnlyOption is used by SetSockOpt/GetSockOpt to specify whether an IPv6
 // socket is to be restricted to sending and receiving IPv6 packets only.
 type V6OnlyOption int
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 3db060384..a111fdb2a 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -319,6 +319,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return nil
 }
 
+// SetSockOptInt sets a socket option. Currently not supported.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+	return nil
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 	switch opt {
@@ -331,6 +336,18 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 		}
 		e.rcvMu.Unlock()
 		return v, nil
+	case tcpip.SendBufferSizeOption:
+		e.mu.Lock()
+		v := e.sndBufSize
+		e.mu.Unlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		e.rcvMu.Lock()
+		v := e.rcvBufSizeMax
+		e.rcvMu.Unlock()
+		return v, nil
+
 	}
 	return -1, tcpip.ErrUnknownProtocolOption
 }
@@ -341,18 +358,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.SendBufferSizeOption:
-		e.mu.Lock()
-		*o = tcpip.SendBufferSizeOption(e.sndBufSize)
-		e.mu.Unlock()
-		return nil
-
-	case *tcpip.ReceiveBufferSizeOption:
-		e.rcvMu.Lock()
-		*o = tcpip.ReceiveBufferSizeOption(e.rcvBufSizeMax)
-		e.rcvMu.Unlock()
-		return nil
-
 	case *tcpip.KeepaliveEnabledOption:
 		*o = 0
 		return nil
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index cf1c5c433..a02731a5d 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -492,6 +492,11 @@ func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 	switch opt {
@@ -504,6 +509,19 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 		}
 		ep.rcvMu.Unlock()
 		return v, nil
+
+	case tcpip.SendBufferSizeOption:
+		ep.mu.Lock()
+		v := ep.sndBufSize
+		ep.mu.Unlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		ep.rcvMu.Lock()
+		v := ep.rcvBufSizeMax
+		ep.rcvMu.Unlock()
+		return v, nil
+
 	}
 
 	return -1, tcpip.ErrUnknownProtocolOption
@@ -515,18 +533,6 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.SendBufferSizeOption:
-		ep.mu.Lock()
-		*o = tcpip.SendBufferSizeOption(ep.sndBufSize)
-		ep.mu.Unlock()
-		return nil
-
-	case *tcpip.ReceiveBufferSizeOption:
-		ep.rcvMu.Lock()
-		*o = tcpip.ReceiveBufferSizeOption(ep.rcvBufSizeMax)
-		ep.rcvMu.Unlock()
-		return nil
-
 	case *tcpip.KeepaliveEnabledOption:
 		*o = 0
 		return nil
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index dd931f88c..35b489c68 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -952,62 +952,9 @@ func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
 	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
 }
 
-// SetSockOpt sets a socket option.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch v := opt.(type) {
-	case tcpip.DelayOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.delay, 0)
-
-			// Handle delayed data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.delay, 1)
-		}
-		return nil
-
-	case tcpip.CorkOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.cork, 0)
-
-			// Handle the corked data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.cork, 1)
-		}
-		return nil
-
-	case tcpip.ReuseAddressOption:
-		e.mu.Lock()
-		e.reuseAddr = v != 0
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.ReusePortOption:
-		e.mu.Lock()
-		e.reusePort = v != 0
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.QuickAckOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.slowAck, 1)
-		} else {
-			atomic.StoreUint32(&e.slowAck, 0)
-		}
-		return nil
-
-	case tcpip.MaxSegOption:
-		userMSS := v
-		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
-			return tcpip.ErrInvalidOptionValue
-		}
-		e.mu.Lock()
-		e.userMSS = int(userMSS)
-		e.mu.Unlock()
-		e.notifyProtocolGoroutine(notifyMSSChanged)
-		return nil
-
+// SetSockOptInt sets a socket option.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+	switch opt {
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
@@ -1071,6 +1018,67 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.sndBufMu.Unlock()
 		return nil
 
+	default:
+		return nil
+	}
+}
+
+// SetSockOpt sets a socket option.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch v := opt.(type) {
+	case tcpip.DelayOption:
+		if v == 0 {
+			atomic.StoreUint32(&e.delay, 0)
+
+			// Handle delayed data.
+			e.sndWaker.Assert()
+		} else {
+			atomic.StoreUint32(&e.delay, 1)
+		}
+		return nil
+
+	case tcpip.CorkOption:
+		if v == 0 {
+			atomic.StoreUint32(&e.cork, 0)
+
+			// Handle the corked data.
+			e.sndWaker.Assert()
+		} else {
+			atomic.StoreUint32(&e.cork, 1)
+		}
+		return nil
+
+	case tcpip.ReuseAddressOption:
+		e.mu.Lock()
+		e.reuseAddr = v != 0
+		e.mu.Unlock()
+		return nil
+
+	case tcpip.ReusePortOption:
+		e.mu.Lock()
+		e.reusePort = v != 0
+		e.mu.Unlock()
+		return nil
+
+	case tcpip.QuickAckOption:
+		if v == 0 {
+			atomic.StoreUint32(&e.slowAck, 1)
+		} else {
+			atomic.StoreUint32(&e.slowAck, 0)
+		}
+		return nil
+
+	case tcpip.MaxSegOption:
+		userMSS := v
+		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
+			return tcpip.ErrInvalidOptionValue
+		}
+		e.mu.Lock()
+		e.userMSS = int(userMSS)
+		e.mu.Unlock()
+		e.notifyProtocolGoroutine(notifyMSSChanged)
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.netProto != header.IPv6ProtocolNumber {
@@ -1182,6 +1190,18 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
 		return e.readyReceiveSize()
+	case tcpip.SendBufferSizeOption:
+		e.sndBufMu.Lock()
+		v := e.sndBufSize
+		e.sndBufMu.Unlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		e.rcvListMu.Lock()
+		v := e.rcvBufSize
+		e.rcvListMu.Unlock()
+		return v, nil
+
 	}
 	return -1, tcpip.ErrUnknownProtocolOption
 }
@@ -1204,18 +1224,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		*o = header.TCPDefaultMSS
 		return nil
 
-	case *tcpip.SendBufferSizeOption:
-		e.sndBufMu.Lock()
-		*o = tcpip.SendBufferSizeOption(e.sndBufSize)
-		e.sndBufMu.Unlock()
-		return nil
-
-	case *tcpip.ReceiveBufferSizeOption:
-		e.rcvListMu.Lock()
-		*o = tcpip.ReceiveBufferSizeOption(e.rcvBufSize)
-		e.rcvListMu.Unlock()
-		return nil
-
 	case *tcpip.DelayOption:
 		*o = 0
 		if v := atomic.LoadUint32(&e.delay); v != 0 {
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
index 272bbcdbd..9fa97528b 100644
--- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -38,7 +38,7 @@ func TestFastRecovery(t *testing.T) {
 	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	const iterations = 7
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
@@ -190,7 +190,7 @@ func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
 	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	const iterations = 7
 	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
@@ -232,7 +232,7 @@ func TestCongestionAvoidance(t *testing.T) {
 	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	const iterations = 7
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
@@ -336,7 +336,7 @@ func TestCubicCongestionAvoidance(t *testing.T) {
 
 	enableCUBIC(t, c)
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	const iterations = 7
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
@@ -445,7 +445,7 @@ func TestRetransmit(t *testing.T) {
 	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	const iterations = 7
 	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 32bb45224..7fa5cfb6e 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -84,7 +84,7 @@ func TestConnectIncrementActiveConnection(t *testing.T) {
 	stats := c.Stack().Stats()
 	want := stats.TCP.ActiveConnectionOpenings.Value() + 1
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	if got := stats.TCP.ActiveConnectionOpenings.Value(); got != want {
 		t.Errorf("got stats.TCP.ActtiveConnectionOpenings.Value() = %v, want = %v", got, want)
 	}
@@ -97,7 +97,7 @@ func TestConnectDoesNotIncrementFailedConnectionAttempts(t *testing.T) {
 	stats := c.Stack().Stats()
 	want := stats.TCP.FailedConnectionAttempts.Value()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	if got := stats.TCP.FailedConnectionAttempts.Value(); got != want {
 		t.Errorf("got stats.TCP.FailedConnectionOpenings.Value() = %v, want = %v", got, want)
 	}
@@ -131,7 +131,7 @@ func TestTCPSegmentsSentIncrement(t *testing.T) {
 	stats := c.Stack().Stats()
 	// SYN and ACK
 	want := stats.TCP.SegmentsSent.Value() + 2
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	if got := stats.TCP.SegmentsSent.Value(); got != want {
 		t.Errorf("got stats.TCP.SegmentsSent.Value() = %v, want = %v", got, want)
@@ -299,7 +299,7 @@ func TestTCPResetsReceivedIncrement(t *testing.T) {
 	want := stats.TCP.ResetsReceived.Value() + 1
 	iss := seqnum.Value(789)
 	rcvWnd := seqnum.Size(30000)
-	c.CreateConnected(iss, rcvWnd, nil)
+	c.CreateConnected(iss, rcvWnd, -1 /* epRcvBuf */)
 
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
@@ -323,7 +323,7 @@ func TestTCPResetsDoNotGenerateResets(t *testing.T) {
 	want := stats.TCP.ResetsReceived.Value() + 1
 	iss := seqnum.Value(789)
 	rcvWnd := seqnum.Size(30000)
-	c.CreateConnected(iss, rcvWnd, nil)
+	c.CreateConnected(iss, rcvWnd, -1 /* epRcvBuf */)
 
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
@@ -344,14 +344,14 @@ func TestActiveHandshake(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 }
 
 func TestNonBlockingClose(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	ep := c.EP
 	c.EP = nil
 
@@ -367,7 +367,7 @@ func TestConnectResetAfterClose(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	ep := c.EP
 	c.EP = nil
 
@@ -417,7 +417,7 @@ func TestSimpleReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -469,7 +469,7 @@ func TestOutOfOrderReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -557,8 +557,7 @@ func TestOutOfOrderFlood(t *testing.T) {
 	defer c.Cleanup()
 
 	// Create a new connection with initial window size of 10.
-	opt := tcpip.ReceiveBufferSizeOption(10)
-	c.CreateConnected(789, 30000, &opt)
+	c.CreateConnected(789, 30000, 10)
 
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
 		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
@@ -631,7 +630,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -700,7 +699,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -785,7 +784,7 @@ func TestShutdownRead(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
 		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
@@ -804,8 +803,7 @@ func TestFullWindowReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	opt := tcpip.ReceiveBufferSizeOption(10)
-	c.CreateConnected(789, 30000, &opt)
+	c.CreateConnected(789, 30000, 10)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -872,11 +870,9 @@ func TestNoWindowShrinking(t *testing.T) {
 	defer c.Cleanup()
 
 	// Start off with a window size of 10, then shrink it to 5.
-	opt := tcpip.ReceiveBufferSizeOption(10)
-	c.CreateConnected(789, 30000, &opt)
+	c.CreateConnected(789, 30000, 10)
 
-	opt = 5
-	if err := c.EP.SetSockOpt(opt); err != nil {
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil {
 		t.Fatalf("SetSockOpt failed: %v", err)
 	}
 
@@ -976,7 +972,7 @@ func TestSimpleSend(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	data := []byte{1, 2, 3}
 	view := buffer.NewView(len(data))
@@ -1017,7 +1013,7 @@ func TestZeroWindowSend(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 0, nil)
+	c.CreateConnected(789, 0, -1 /* epRcvBuf */)
 
 	data := []byte{1, 2, 3}
 	view := buffer.NewView(len(data))
@@ -1075,8 +1071,7 @@ func TestScaledWindowConnect(t *testing.T) {
 	defer c.Cleanup()
 
 	// Set the window size greater than the maximum non-scaled window.
-	opt := tcpip.ReceiveBufferSizeOption(65535 * 3)
-	c.CreateConnectedWithRawOptions(789, 30000, &opt, []byte{
+	c.CreateConnectedWithRawOptions(789, 30000, 65535*3, []byte{
 		header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
 	})
 
@@ -1110,8 +1105,7 @@ func TestNonScaledWindowConnect(t *testing.T) {
 	defer c.Cleanup()
 
 	// Set the window size greater than the maximum non-scaled window.
-	opt := tcpip.ReceiveBufferSizeOption(65535 * 3)
-	c.CreateConnected(789, 30000, &opt)
+	c.CreateConnected(789, 30000, 65535*3)
 
 	data := []byte{1, 2, 3}
 	view := buffer.NewView(len(data))
@@ -1151,7 +1145,7 @@ func TestScaledWindowAccept(t *testing.T) {
 	defer ep.Close()
 
 	// Set the window size greater than the maximum non-scaled window.
-	if err := ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(65535 * 3)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
 		t.Fatalf("SetSockOpt failed failed: %v", err)
 	}
 
@@ -1224,7 +1218,7 @@ func TestNonScaledWindowAccept(t *testing.T) {
 	defer ep.Close()
 
 	// Set the window size greater than the maximum non-scaled window.
-	if err := ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(65535 * 3)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
 		t.Fatalf("SetSockOpt failed failed: %v", err)
 	}
 
@@ -1293,8 +1287,7 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 	// Set the window size such that a window scale of 4 will be used.
 	const wnd = 65535 * 10
 	const ws = uint32(4)
-	opt := tcpip.ReceiveBufferSizeOption(wnd)
-	c.CreateConnectedWithRawOptions(789, 30000, &opt, []byte{
+	c.CreateConnectedWithRawOptions(789, 30000, wnd, []byte{
 		header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
 	})
 
@@ -1399,7 +1392,7 @@ func TestSegmentMerging(t *testing.T) {
 			c := context.New(t, defaultMTU)
 			defer c.Cleanup()
 
-			c.CreateConnected(789, 30000, nil)
+			c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 			// Prevent the endpoint from processing packets.
 			test.stop(c.EP)
@@ -1449,7 +1442,7 @@ func TestDelay(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	c.EP.SetSockOpt(tcpip.DelayOption(1))
 
@@ -1497,7 +1490,7 @@ func TestUndelay(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	c.EP.SetSockOpt(tcpip.DelayOption(1))
 
@@ -1579,7 +1572,7 @@ func TestMSSNotDelayed(t *testing.T) {
 			c := context.New(t, defaultMTU)
 			defer c.Cleanup()
 
-			c.CreateConnectedWithRawOptions(789, 30000, nil, []byte{
+			c.CreateConnectedWithRawOptions(789, 30000, -1 /* epRcvBuf */, []byte{
 				header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
 			})
 
@@ -1695,7 +1688,7 @@ func TestSendGreaterThanMTU(t *testing.T) {
 	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	testBrokenUpWrite(t, c, maxPayload)
 }
 
@@ -1704,7 +1697,7 @@ func TestActiveSendMSSLessThanMTU(t *testing.T) {
 	c := context.New(t, 65535)
 	defer c.Cleanup()
 
-	c.CreateConnectedWithRawOptions(789, 30000, nil, []byte{
+	c.CreateConnectedWithRawOptions(789, 30000, -1 /* epRcvBuf */, []byte{
 		header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
 	})
 	testBrokenUpWrite(t, c, maxPayload)
@@ -1727,7 +1720,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) {
 	// Set the buffer size to a deterministic size so that we can check the
 	// window scaling option.
 	const rcvBufferSize = 0x20000
-	if err := ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(rcvBufferSize)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
 		t.Fatalf("SetSockOpt failed failed: %v", err)
 	}
 
@@ -1871,7 +1864,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 	// window scaling option.
 	const rcvBufferSize = 0x20000
 	const wndScale = 2
-	if err := c.EP.SetSockOpt(tcpip.ReceiveBufferSizeOption(rcvBufferSize)); err != nil {
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
 		t.Fatalf("SetSockOpt failed failed: %v", err)
 	}
 
@@ -1973,7 +1966,7 @@ func TestReceiveOnResetConnection(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Send RST segment.
 	c.SendPacket(nil, &context.Headers{
@@ -2010,7 +2003,7 @@ func TestSendOnResetConnection(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Send RST segment.
 	c.SendPacket(nil, &context.Headers{
@@ -2035,7 +2028,7 @@ func TestFinImmediately(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Shutdown immediately, check that we get a FIN.
 	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
@@ -2078,7 +2071,7 @@ func TestFinRetransmit(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Shutdown immediately, check that we get a FIN.
 	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
@@ -2132,7 +2125,7 @@ func TestFinWithNoPendingData(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Write something out, and have it acknowledged.
 	view := buffer.NewView(10)
@@ -2203,7 +2196,7 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Write enough segments to fill the congestion window before ACK'ing
 	// any of them.
@@ -2291,7 +2284,7 @@ func TestFinWithPendingData(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Write something out, and acknowledge it to get cwnd to 2.
 	view := buffer.NewView(10)
@@ -2377,7 +2370,7 @@ func TestFinWithPartialAck(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Write something out, and acknowledge it to get cwnd to 2. Also send
 	// FIN from the test side.
@@ -2509,7 +2502,7 @@ func scaledSendWindow(t *testing.T, scale uint8) {
 	defer c.Cleanup()
 
 	maxPayload := defaultMTU - header.IPv4MinimumSize - header.TCPMinimumSize
-	c.CreateConnectedWithRawOptions(789, 0, nil, []byte{
+	c.CreateConnectedWithRawOptions(789, 0, -1 /* epRcvBuf */, []byte{
 		header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
 		header.TCPOptionWS, 3, scale, header.TCPOptionNOP,
 	})
@@ -2559,7 +2552,7 @@ func TestScaledSendWindow(t *testing.T) {
 func TestReceivedValidSegmentCountIncrement(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	stats := c.Stack().Stats()
 	want := stats.TCP.ValidSegmentsReceived.Value() + 1
 
@@ -2580,7 +2573,7 @@ func TestReceivedValidSegmentCountIncrement(t *testing.T) {
 func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	stats := c.Stack().Stats()
 	want := stats.TCP.InvalidSegmentsReceived.Value() + 1
 	vv := c.BuildSegment(nil, &context.Headers{
@@ -2604,7 +2597,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
 func TestReceivedIncorrectChecksumIncrement(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	stats := c.Stack().Stats()
 	want := stats.TCP.ChecksumErrors.Value() + 1
 	vv := c.BuildSegment([]byte{0x1, 0x2, 0x3}, &context.Headers{
@@ -2635,7 +2628,7 @@ func TestReceivedSegmentQueuing(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	// Send 200 segments.
 	data := []byte{1, 2, 3}
@@ -2681,7 +2674,7 @@ func TestReadAfterClosedState(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -2856,8 +2849,8 @@ func TestReusePort(t *testing.T) {
 func checkRecvBufferSize(t *testing.T, ep tcpip.Endpoint, v int) {
 	t.Helper()
 
-	var s tcpip.ReceiveBufferSizeOption
-	if err := ep.GetSockOpt(&s); err != nil {
+	s, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+	if err != nil {
 		t.Fatalf("GetSockOpt failed: %v", err)
 	}
 
@@ -2869,8 +2862,8 @@ func checkRecvBufferSize(t *testing.T, ep tcpip.Endpoint, v int) {
 func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) {
 	t.Helper()
 
-	var s tcpip.SendBufferSizeOption
-	if err := ep.GetSockOpt(&s); err != nil {
+	s, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption)
+	if err != nil {
 		t.Fatalf("GetSockOpt failed: %v", err)
 	}
 
@@ -2945,26 +2938,26 @@ func TestMinMaxBufferSizes(t *testing.T) {
 	}
 
 	// Set values below the min.
-	if err := ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(199)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil {
 		t.Fatalf("GetSockOpt failed: %v", err)
 	}
 
 	checkRecvBufferSize(t, ep, 200)
 
-	if err := ep.SetSockOpt(tcpip.SendBufferSizeOption(299)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil {
 		t.Fatalf("GetSockOpt failed: %v", err)
 	}
 
 	checkSendBufferSize(t, ep, 300)
 
 	// Set values above the max.
-	if err := ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(1 + tcp.DefaultReceiveBufferSize*20)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 1+tcp.DefaultReceiveBufferSize*20); err != nil {
 		t.Fatalf("GetSockOpt failed: %v", err)
 	}
 
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20)
 
-	if err := ep.SetSockOpt(tcpip.SendBufferSizeOption(1 + tcp.DefaultSendBufferSize*30)); err != nil {
+	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil {
 		t.Fatalf("GetSockOpt failed: %v", err)
 	}
 
@@ -3231,7 +3224,7 @@ func TestPathMTUDiscovery(t *testing.T) {
 
 	// Create new connection with MSS of 1460.
 	const maxPayload = 1500 - header.TCPMinimumSize - header.IPv4MinimumSize
-	c.CreateConnectedWithRawOptions(789, 30000, nil, []byte{
+	c.CreateConnectedWithRawOptions(789, 30000, -1 /* epRcvBuf */, []byte{
 		header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
 	})
 
@@ -3308,7 +3301,7 @@ func TestTCPEndpointProbe(t *testing.T) {
 		invoked <- struct{}{}
 	})
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	data := []byte{1, 2, 3}
 	c.SendPacket(data, &context.Headers{
@@ -3482,7 +3475,7 @@ func TestKeepalive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, nil)
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(10 * time.Millisecond))
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 16783e716..78eff5c3a 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -512,7 +512,7 @@ func (c *Context) SendV6Packet(payload []byte, h *Headers) {
 }
 
 // CreateConnected creates a connected TCP endpoint.
-func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf *tcpip.ReceiveBufferSizeOption) {
+func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf int) {
 	c.CreateConnectedWithRawOptions(iss, rcvWnd, epRcvBuf, nil)
 }
 
@@ -590,7 +590,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
 //
 // It also sets the receive buffer for the endpoint to the specified
 // value in epRcvBuf.
-func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf *tcpip.ReceiveBufferSizeOption, options []byte) {
+func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf int, options []byte) {
 	// Create TCP endpoint.
 	var err *tcpip.Error
 	c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
@@ -598,8 +598,8 @@ func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.
 		c.t.Fatalf("NewEndpoint failed: %v", err)
 	}
 
-	if epRcvBuf != nil {
-		if err := c.EP.SetSockOpt(*epRcvBuf); err != nil {
+	if epRcvBuf != -1 {
+		if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, epRcvBuf); err != nil {
 			c.t.Fatalf("SetSockOpt failed failed: %v", err)
 		}
 	}
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 6ac7c067a..0bec7e62d 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -389,7 +389,12 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
-// SetSockOpt sets a socket option. Currently not supported.
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+	return nil
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 	switch v := opt.(type) {
 	case tcpip.V6OnlyOption:
@@ -568,7 +573,20 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
 		}
 		e.rcvMu.Unlock()
 		return v, nil
+
+	case tcpip.SendBufferSizeOption:
+		e.mu.Lock()
+		v := e.sndBufSize
+		e.mu.Unlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		e.rcvMu.Lock()
+		v := e.rcvBufSizeMax
+		e.rcvMu.Unlock()
+		return v, nil
 	}
+
 	return -1, tcpip.ErrUnknownProtocolOption
 }
 
@@ -578,18 +596,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.SendBufferSizeOption:
-		e.mu.Lock()
-		*o = tcpip.SendBufferSizeOption(e.sndBufSize)
-		e.mu.Unlock()
-		return nil
-
-	case *tcpip.ReceiveBufferSizeOption:
-		e.rcvMu.Lock()
-		*o = tcpip.ReceiveBufferSizeOption(e.rcvBufSizeMax)
-		e.rcvMu.Unlock()
-		return nil
-
 	case *tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.netProto != header.IPv6ProtocolNumber {
-- 
cgit v1.2.3


From bc9de939fd00e71ed8fbfc6c1b631c1facb445a2 Mon Sep 17 00:00:00 2001
From: "henry.tjf" <henry.tjf@antfin.com>
Date: Wed, 11 Sep 2019 18:30:57 +0800
Subject: tty: fix sending SIGTTOU on tty write

How to reproduce:
  $ echo "timeout 10 ls" > foo.sh
  $ chmod +x foo.sh
  $ ./foo.sh
  (will hang here for 10 secs, and the output of ls does not show)

When "ls" process writes to stdout, it receives SIGTTOU signal, and
hangs there. Until "timeout" process timeouts, and kills "ls" process.

The expected result is: "ls" writes its output into tty, and terminates
immdedately, then "timeout" process receives SIGCHLD and terminates.

The reason for this failure is that we missed the check for TOSTOP (if
set, background processes will receive the SIGTTOU signal when they do
write).

We use drivers/tty/n_tty.c:n_tty_write() as a reference.

Fixes: #862

Reported-by: chris.zn <chris.zn@antfin.com>
Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Signed-off-by: chenglang.hy <chenglang.hy@antfin.com>
---
 pkg/sentry/fs/host/tty.go | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 2526412a4..90331e3b2 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -43,12 +43,15 @@ type TTYFileOperations struct {
 	// fgProcessGroup is the foreground process group that is currently
 	// connected to this TTY.
 	fgProcessGroup *kernel.ProcessGroup
+
+	termios linux.KernelTermios
 }
 
 // newTTYFile returns a new fs.File that wraps a TTY FD.
 func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
 	return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
 		fileOperations: fileOperations{iops: iops},
+		termios:        linux.DefaultSlaveTermios,
 	})
 }
 
@@ -97,9 +100,12 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
-	// Are we allowed to do the write?
-	if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
-		return 0, err
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
 	}
 	return t.fileOperations.Write(ctx, file, src, offset)
 }
@@ -144,6 +150,9 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 			return 0, err
 		}
 		err := ioctlSetTermios(fd, ioctl, &termios)
+		if err == nil {
+			t.termios.FromTermios(termios)
+		}
 		return 0, err
 
 	case linux.TIOCGPGRP:
-- 
cgit v1.2.3


From 502f8f238ea58c4828e528e563d8dbd419faeea7 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 24 Sep 2019 13:25:25 -0700
Subject: Stub out readahead implementation.

Closes #261

PiperOrigin-RevId: 270973347
---
 pkg/sentry/syscalls/linux/linux64.go  |  2 +-
 pkg/sentry/syscalls/linux/sys_read.go | 33 +++++++++++++
 test/syscalls/BUILD                   |  5 ++
 test/syscalls/linux/BUILD             | 14 ++++++
 test/syscalls/linux/readahead.cc      | 91 +++++++++++++++++++++++++++++++++++
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 test/syscalls/linux/readahead.cc

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 18d24ab61..61acd0abd 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -232,7 +232,7 @@ var AMD64 = &kernel.SyscallTable{
 		184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
-		187: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
+		187: syscalls.Supported("readahead", Readahead),
 		188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 3ab54271c..cd31e0649 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -72,6 +72,39 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
 }
 
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	size := args[2].SizeT()
+
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the file is readable.
+	if !file.Flags().Read {
+		return 0, nil, syserror.EBADF
+	}
+
+	// Check that the size is valid.
+	if int(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Return EINVAL; if the underlying file type does not support readahead,
+	// then Linux will return EINVAL to indicate as much. In the future, we
+	// may extend this function to actually support readahead hints.
+	return 0, nil, syserror.EINVAL
+}
+
 // Pread64 implements linux syscall pread64(2).
 func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 63e4c63dd..341e6b252 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -345,6 +345,11 @@ syscall_test(
     test = "//test/syscalls/linux:read_test",
 )
 
+syscall_test(
+    add_overlay = True,
+    test = "//test/syscalls/linux:readahead_test",
+)
+
 syscall_test(
     size = "medium",
     shard_count = 5,
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index a4cebf46f..28b23ce58 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1735,6 +1735,20 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "readahead_test",
+    testonly = 1,
+    srcs = ["readahead.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:file_descriptor",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "readv_test",
     testonly = 1,
diff --git a/test/syscalls/linux/readahead.cc b/test/syscalls/linux/readahead.cc
new file mode 100644
index 000000000..09703b5c1
--- /dev/null
+++ b/test/syscalls/linux/readahead.cc
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ReadaheadTest, InvalidFD) {
+  EXPECT_THAT(readahead(-1, 1, 1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ReadaheadTest, InvalidOffset) {
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+  EXPECT_THAT(readahead(fd.get(), -1, 1), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(ReadaheadTest, ValidOffset) {
+  constexpr char kData[] = "123";
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+
+  // N.B. The implementation of readahead is filesystem-specific, and a file
+  // backed by ram may return EINVAL because there is nothing to be read.
+  EXPECT_THAT(readahead(fd.get(), 1, 1), AnyOf(SyscallSucceedsWithValue(0),
+                                               SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST(ReadaheadTest, PastEnd) {
+  constexpr char kData[] = "123";
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+  // See above.
+  EXPECT_THAT(readahead(fd.get(), 2, 2), AnyOf(SyscallSucceedsWithValue(0),
+                                               SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST(ReadaheadTest, CrossesEnd) {
+  constexpr char kData[] = "123";
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+  // See above.
+  EXPECT_THAT(readahead(fd.get(), 4, 2), AnyOf(SyscallSucceedsWithValue(0),
+                                               SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST(ReadaheadTest, WriteOnly) {
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_WRONLY));
+  EXPECT_THAT(readahead(fd.get(), 0, 1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ReadaheadTest, InvalidSize) {
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+  EXPECT_THAT(readahead(fd.get(), 0, -1), SyscallFailsWithErrno(EINVAL));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 76ff1947b6e3703c2b9524e1086c791ceb4edb74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 24 Sep 2019 23:46:08 -0700
Subject: gvisor: change syscall.RawSyscall to syscall.RawSyscall6 where
 required

Before https://golang.org/cl/173160 syscall.RawSyscall would zero out
the last three register arguments to the system call. That no longer happens.
For system calls that take more than three arguments, use RawSyscall6 to
ensure that we pass zero, not random data, for the additional arguments.

PiperOrigin-RevId: 271062527
---
 pkg/seccomp/seccomp_unsafe.go                  |  2 +-
 pkg/sentry/platform/ptrace/subprocess.go       | 14 +++++++-------
 pkg/sentry/platform/ptrace/subprocess_linux.go |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index 0a3d92854..be328db12 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -35,7 +35,7 @@ type sockFprog struct {
 //go:nosplit
 func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
 	// PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details.
-	if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 {
+	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 {
 		return errno
 	}
 
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 4f8f9c5d9..9f0ecfbe4 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -267,7 +267,7 @@ func (s *subprocess) newThread() *thread {
 
 // attach attaches to the thread.
 func (t *thread) attach() {
-	if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 {
+	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 		panic(fmt.Sprintf("unable to attach: %v", errno))
 	}
 
@@ -417,7 +417,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 
 	for {
 		// Execute the syscall instruction.
-		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 {
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
@@ -435,7 +435,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 	}
 
 	// Complete the actual system call.
-	if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 {
+	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 		panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 	}
 
@@ -526,17 +526,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	for {
 		// Start running until the next system call.
 		if isSingleStepping(regs) {
-			if _, _, errno := syscall.RawSyscall(
+			if _, _, errno := syscall.RawSyscall6(
 				syscall.SYS_PTRACE,
 				syscall.PTRACE_SYSEMU_SINGLESTEP,
-				uintptr(t.tid), 0); errno != 0 {
+				uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 				panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
 			}
 		} else {
-			if _, _, errno := syscall.RawSyscall(
+			if _, _, errno := syscall.RawSyscall6(
 				syscall.SYS_PTRACE,
 				syscall.PTRACE_SYSEMU,
-				uintptr(t.tid), 0); errno != 0 {
+				uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 				panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
 			}
 		}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index f09b0b3d0..c075b5f91 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -53,7 +53,7 @@ func probeSeccomp() bool {
 
 	for {
 		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
@@ -266,7 +266,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 
 	// Enable cpuid-faulting; this may fail on older kernels or hardware,
 	// so we just disregard the result. Host CPUID will be enabled.
-	syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
+	syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
 
 	// Call the stub; should not return.
 	stubCall(stubStart, ppid)
-- 
cgit v1.2.3


From 543492650dd528c1d837d788dcd3b5138e8dc1c0 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 26 Sep 2019 15:07:59 -0700
Subject: Make raw socket tests pass in environments with or without
 CAP_NET_RAW.

PiperOrigin-RevId: 271442321
---
 pkg/sentry/socket/epsocket/provider.go    |  2 +-
 test/syscalls/linux/packet_socket.cc      | 29 ++++++++++++++-------
 test/syscalls/linux/packet_socket_raw.cc  | 21 ++++++++-------
 test/syscalls/linux/raw_socket_hdrincl.cc | 43 +++++++------------------------
 test/syscalls/linux/raw_socket_icmp.cc    | 13 +++++++---
 test/syscalls/linux/raw_socket_ipv4.cc    | 13 +++++++---
 6 files changed, 59 insertions(+), 62 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go
index 421f93dc4..0a9dfa6c3 100644
--- a/pkg/sentry/socket/epsocket/provider.go
+++ b/pkg/sentry/socket/epsocket/provider.go
@@ -65,7 +65,7 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		// Raw sockets require CAP_NET_RAW.
 		creds := auth.CredentialsFromContext(ctx)
 		if !creds.HasCapability(linux.CAP_NET_RAW) {
-			return 0, true, syserr.ErrPermissionDenied
+			return 0, true, syserr.ErrNotPermitted
 		}
 
 		switch protocol {
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 7a3379b9e..37b4e6575 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -83,9 +83,15 @@ void SendUDPMessage(int sock) {
 
 // Send an IP packet and make sure ETH_P_<something else> doesn't pick it up.
 TEST(BasicCookedPacketTest, WrongType) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  // (b/129292371): Remove once we support packet sockets.
   SKIP_IF(IsRunningOnGvisor());
 
+  if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP),
+                SyscallFailsWithErrno(EPERM));
+    GTEST_SKIP();
+  }
+
   FileDescriptor sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP));
 
@@ -118,18 +124,27 @@ class CookedPacketTest : public ::testing::TestWithParam<int> {
 };
 
 void CookedPacketTest::SetUp() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  // (b/129292371): Remove once we support packet sockets.
   SKIP_IF(IsRunningOnGvisor());
 
+  if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
+                SyscallFailsWithErrno(EPERM));
+    GTEST_SKIP();
+  }
+
   ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
               SyscallSucceeds());
 }
 
 void CookedPacketTest::TearDown() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  // (b/129292371): Remove once we support packet sockets.
   SKIP_IF(IsRunningOnGvisor());
 
-  EXPECT_THAT(close(socket_), SyscallSucceeds());
+  // TearDown will be run even if we skip the test.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    EXPECT_THAT(close(socket_), SyscallSucceeds());
+  }
 }
 
 int CookedPacketTest::GetLoopbackIndex() {
@@ -142,9 +157,6 @@ int CookedPacketTest::GetLoopbackIndex() {
 
 // Receive via a packet socket.
 TEST_P(CookedPacketTest, Receive) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-  SKIP_IF(IsRunningOnGvisor());
-
   // Let's use a simple IP payload: a UDP datagram.
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
@@ -201,9 +213,6 @@ TEST_P(CookedPacketTest, Receive) {
 
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-  SKIP_IF(IsRunningOnGvisor());
-
   // Let's send a UDP packet and receive it using a regular UDP socket.
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc
index 9e96460ee..6491453b6 100644
--- a/test/syscalls/linux/packet_socket_raw.cc
+++ b/test/syscalls/linux/packet_socket_raw.cc
@@ -97,9 +97,15 @@ class RawPacketTest : public ::testing::TestWithParam<int> {
 };
 
 void RawPacketTest::SetUp() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  // (b/129292371): Remove once we support packet sockets.
   SKIP_IF(IsRunningOnGvisor());
 
+  if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    ASSERT_THAT(socket(AF_PACKET, SOCK_RAW, htons(GetParam())),
+                SyscallFailsWithErrno(EPERM));
+    GTEST_SKIP();
+  }
+
   if (!IsRunningOnGvisor()) {
     FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE(
         Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY));
@@ -119,10 +125,13 @@ void RawPacketTest::SetUp() {
 }
 
 void RawPacketTest::TearDown() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  // (b/129292371): Remove once we support packet sockets.
   SKIP_IF(IsRunningOnGvisor());
 
-  EXPECT_THAT(close(socket_), SyscallSucceeds());
+  // TearDown will be run even if we skip the test.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    EXPECT_THAT(close(socket_), SyscallSucceeds());
+  }
 }
 
 int RawPacketTest::GetLoopbackIndex() {
@@ -135,9 +144,6 @@ int RawPacketTest::GetLoopbackIndex() {
 
 // Receive via a packet socket.
 TEST_P(RawPacketTest, Receive) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-  SKIP_IF(IsRunningOnGvisor());
-
   // Let's use a simple IP payload: a UDP datagram.
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
@@ -208,9 +214,6 @@ TEST_P(RawPacketTest, Receive) {
 
 // Send via a packet socket.
 TEST_P(RawPacketTest, Send) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-  SKIP_IF(IsRunningOnGvisor());
-
   // Let's send a UDP packet and receive it using a regular UDP socket.
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc
index a070817eb..0a27506aa 100644
--- a/test/syscalls/linux/raw_socket_hdrincl.cc
+++ b/test/syscalls/linux/raw_socket_hdrincl.cc
@@ -63,7 +63,11 @@ class RawHDRINCL : public ::testing::Test {
 };
 
 void RawHDRINCL::SetUp() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    ASSERT_THAT(socket(AF_INET, SOCK_RAW, IPPROTO_RAW),
+                SyscallFailsWithErrno(EPERM));
+    GTEST_SKIP();
+  }
 
   ASSERT_THAT(socket_ = socket(AF_INET, SOCK_RAW, IPPROTO_RAW),
               SyscallSucceeds());
@@ -76,9 +80,10 @@ void RawHDRINCL::SetUp() {
 }
 
 void RawHDRINCL::TearDown() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
-  EXPECT_THAT(close(socket_), SyscallSucceeds());
+  // TearDown will be run even if we skip the test.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    EXPECT_THAT(close(socket_), SyscallSucceeds());
+  }
 }
 
 struct iphdr RawHDRINCL::LoopbackHeader() {
@@ -123,8 +128,6 @@ bool RawHDRINCL::FillPacket(char* buf, size_t buf_size, int port,
 // We should be able to create multiple IPPROTO_RAW sockets. RawHDRINCL::Setup
 // creates the first one, so we only have to create one more here.
 TEST_F(RawHDRINCL, MultipleCreation) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   int s2;
   ASSERT_THAT(s2 = socket(AF_INET, SOCK_RAW, IPPROTO_RAW), SyscallSucceeds());
 
@@ -133,23 +136,17 @@ TEST_F(RawHDRINCL, MultipleCreation) {
 
 // Test that shutting down an unconnected socket fails.
 TEST_F(RawHDRINCL, FailShutdownWithoutConnect) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   ASSERT_THAT(shutdown(socket_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
   ASSERT_THAT(shutdown(socket_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
 }
 
 // Test that listen() fails.
 TEST_F(RawHDRINCL, FailListen) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   ASSERT_THAT(listen(socket_, 1), SyscallFailsWithErrno(ENOTSUP));
 }
 
 // Test that accept() fails.
 TEST_F(RawHDRINCL, FailAccept) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   struct sockaddr saddr;
   socklen_t addrlen;
   ASSERT_THAT(accept(socket_, &saddr, &addrlen),
@@ -158,8 +155,6 @@ TEST_F(RawHDRINCL, FailAccept) {
 
 // Test that the socket is writable immediately.
 TEST_F(RawHDRINCL, PollWritableImmediately) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   struct pollfd pfd = {};
   pfd.fd = socket_;
   pfd.events = POLLOUT;
@@ -168,8 +163,6 @@ TEST_F(RawHDRINCL, PollWritableImmediately) {
 
 // Test that the socket isn't readable.
 TEST_F(RawHDRINCL, NotReadable) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   // Try to receive data with MSG_DONTWAIT, which returns immediately if there's
   // nothing to be read.
   char buf[117];
@@ -179,16 +172,12 @@ TEST_F(RawHDRINCL, NotReadable) {
 
 // Test that we can connect() to a valid IP (loopback).
 TEST_F(RawHDRINCL, ConnectToLoopback) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   ASSERT_THAT(connect(socket_, reinterpret_cast<struct sockaddr*>(&addr_),
                       sizeof(addr_)),
               SyscallSucceeds());
 }
 
 TEST_F(RawHDRINCL, SendWithoutConnectSucceeds) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   struct iphdr hdr = LoopbackHeader();
   ASSERT_THAT(send(socket_, &hdr, sizeof(hdr), 0),
               SyscallSucceedsWithValue(sizeof(hdr)));
@@ -197,8 +186,6 @@ TEST_F(RawHDRINCL, SendWithoutConnectSucceeds) {
 // HDRINCL implies write-only. Verify that we can't read a packet sent to
 // loopback.
 TEST_F(RawHDRINCL, NotReadableAfterWrite) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   ASSERT_THAT(connect(socket_, reinterpret_cast<struct sockaddr*>(&addr_),
                       sizeof(addr_)),
               SyscallSucceeds());
@@ -221,8 +208,6 @@ TEST_F(RawHDRINCL, NotReadableAfterWrite) {
 }
 
 TEST_F(RawHDRINCL, WriteTooSmall) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   ASSERT_THAT(connect(socket_, reinterpret_cast<struct sockaddr*>(&addr_),
                       sizeof(addr_)),
               SyscallSucceeds());
@@ -235,8 +220,6 @@ TEST_F(RawHDRINCL, WriteTooSmall) {
 
 // Bind to localhost.
 TEST_F(RawHDRINCL, BindToLocalhost) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   ASSERT_THAT(
       bind(socket_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)),
       SyscallSucceeds());
@@ -244,8 +227,6 @@ TEST_F(RawHDRINCL, BindToLocalhost) {
 
 // Bind to a different address.
 TEST_F(RawHDRINCL, BindToInvalid) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   struct sockaddr_in bind_addr = {};
   bind_addr.sin_family = AF_INET;
   bind_addr.sin_addr = {1};  // 1.0.0.0 - An address that we can't bind to.
@@ -256,8 +237,6 @@ TEST_F(RawHDRINCL, BindToInvalid) {
 
 // Send and receive a packet.
 TEST_F(RawHDRINCL, SendAndReceive) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   int port = 40000;
   if (!IsRunningOnGvisor()) {
     port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
@@ -302,8 +281,6 @@ TEST_F(RawHDRINCL, SendAndReceive) {
 
 // Send and receive a packet with nonzero IP ID.
 TEST_F(RawHDRINCL, SendAndReceiveNonzeroID) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   int port = 40000;
   if (!IsRunningOnGvisor()) {
     port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
@@ -349,8 +326,6 @@ TEST_F(RawHDRINCL, SendAndReceiveNonzeroID) {
 // Send and receive a packet where the sendto address is not the same as the
 // provided destination.
 TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
   int port = 40000;
   if (!IsRunningOnGvisor()) {
     port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc
index 971592d7d..8bcaba6f1 100644
--- a/test/syscalls/linux/raw_socket_icmp.cc
+++ b/test/syscalls/linux/raw_socket_icmp.cc
@@ -77,7 +77,11 @@ class RawSocketICMPTest : public ::testing::Test {
 };
 
 void RawSocketICMPTest::SetUp() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    ASSERT_THAT(socket(AF_INET, SOCK_RAW, IPPROTO_ICMP),
+                SyscallFailsWithErrno(EPERM));
+    GTEST_SKIP();
+  }
 
   ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds());
 
@@ -90,9 +94,10 @@ void RawSocketICMPTest::SetUp() {
 }
 
 void RawSocketICMPTest::TearDown() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
-  EXPECT_THAT(close(s_), SyscallSucceeds());
+  // TearDown will be run even if we skip the test.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    EXPECT_THAT(close(s_), SyscallSucceeds());
+  }
 }
 
 // We'll only read an echo in this case, as the kernel won't respond to the
diff --git a/test/syscalls/linux/raw_socket_ipv4.cc b/test/syscalls/linux/raw_socket_ipv4.cc
index 352037c88..cde2f07c9 100644
--- a/test/syscalls/linux/raw_socket_ipv4.cc
+++ b/test/syscalls/linux/raw_socket_ipv4.cc
@@ -67,7 +67,11 @@ class RawSocketTest : public ::testing::TestWithParam<int> {
 };
 
 void RawSocketTest::SetUp() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+  if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    ASSERT_THAT(socket(AF_INET, SOCK_RAW, Protocol()),
+                SyscallFailsWithErrno(EPERM));
+    GTEST_SKIP();
+  }
 
   ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, Protocol()), SyscallSucceeds());
 
@@ -79,9 +83,10 @@ void RawSocketTest::SetUp() {
 }
 
 void RawSocketTest::TearDown() {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-
-  EXPECT_THAT(close(s_), SyscallSucceeds());
+  // TearDown will be run even if we skip the test.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+    EXPECT_THAT(close(s_), SyscallSucceeds());
+  }
 }
 
 // We should be able to create multiple raw sockets for the same protocol.
-- 
cgit v1.2.3


From abbee5615f4480d8a41b4cf63839d2ab13b19abf Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 27 Sep 2019 14:12:35 -0700
Subject: Implement SO_BINDTODEVICE sockopt

PiperOrigin-RevId: 271644926
---
 pkg/sentry/socket/epsocket/epsocket.go             |  20 ++
 pkg/sentry/syscalls/linux/sys_socket.go            |   2 +-
 pkg/tcpip/ports/ports.go                           | 114 ++++--
 pkg/tcpip/ports/ports_test.go                      | 113 +++++-
 pkg/tcpip/stack/BUILD                              |   4 +
 pkg/tcpip/stack/nic.go                             |  15 +-
 pkg/tcpip/stack/stack.go                           |  58 +---
 pkg/tcpip/stack/transport_demuxer.go               | 227 +++++++-----
 pkg/tcpip/stack/transport_demuxer_test.go          | 352 +++++++++++++++++++
 pkg/tcpip/stack/transport_test.go                  |   5 +-
 pkg/tcpip/tcpip.go                                 |   4 +
 pkg/tcpip/transport/icmp/endpoint.go               |   6 +-
 pkg/tcpip/transport/tcp/accept.go                  |   2 +-
 pkg/tcpip/transport/tcp/endpoint.go                |  56 ++-
 pkg/tcpip/transport/tcp/tcp_test.go                | 116 +++++++
 pkg/tcpip/transport/tcp/testing/context/context.go |  26 +-
 pkg/tcpip/transport/udp/BUILD                      |   1 +
 pkg/tcpip/transport/udp/endpoint.go                |  42 ++-
 pkg/tcpip/transport/udp/forwarder.go               |   2 +-
 pkg/tcpip/transport/udp/udp_test.go                | 120 +++----
 test/syscalls/linux/BUILD                          |  75 ++++
 test/syscalls/linux/socket_bind_to_device.cc       | 314 +++++++++++++++++
 .../linux/socket_bind_to_device_distribution.cc    | 381 +++++++++++++++++++++
 .../linux/socket_bind_to_device_sequence.cc        | 316 +++++++++++++++++
 test/syscalls/linux/socket_bind_to_device_util.cc  |  75 ++++
 test/syscalls/linux/socket_bind_to_device_util.h   |  67 ++++
 test/syscalls/linux/uidgid.cc                      |  25 +-
 test/util/BUILD                                    |  11 +
 test/util/uid_util.cc                              |  44 +++
 test/util/uid_util.h                               |  29 ++
 30 files changed, 2308 insertions(+), 314 deletions(-)
 create mode 100644 pkg/tcpip/stack/transport_demuxer_test.go
 create mode 100644 test/syscalls/linux/socket_bind_to_device.cc
 create mode 100644 test/syscalls/linux/socket_bind_to_device_distribution.cc
 create mode 100644 test/syscalls/linux/socket_bind_to_device_sequence.cc
 create mode 100644 test/syscalls/linux/socket_bind_to_device_util.cc
 create mode 100644 test/syscalls/linux/socket_bind_to_device_util.h
 create mode 100644 test/util/uid_util.cc
 create mode 100644 test/util/uid_util.h

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 3e66f9cbb..5812085fa 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -942,6 +942,19 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 
 		return int32(v), nil
 
+	case linux.SO_BINDTODEVICE:
+		var v tcpip.BindToDeviceOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		if len(v) == 0 {
+			return []byte{}, nil
+		}
+		if outLen < linux.IFNAMSIZ {
+			return nil, syserr.ErrInvalidArgument
+		}
+		return append([]byte(v), 0), nil
+
 	case linux.SO_BROADCAST:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
@@ -1305,6 +1318,13 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		v := usermem.ByteOrder.Uint32(optVal)
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v)))
 
+	case linux.SO_BINDTODEVICE:
+		n := bytes.IndexByte(optVal, 0)
+		if n == -1 {
+			n = len(optVal)
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(optVal[:n])))
+
 	case linux.SO_BROADCAST:
 		if len(optVal) < sizeOfInt32 {
 			return syserr.ErrInvalidArgument
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 3bac4d90d..b5a72ce63 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -531,7 +531,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, syserror.ENOTSOCK
 	}
 
-	if optLen <= 0 {
+	if optLen < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 	if optLen > maxOptLen {
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index 315780c0c..40e202717 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -47,43 +47,76 @@ type portNode struct {
 	refs  int
 }
 
-// bindAddresses is a set of IP addresses.
-type bindAddresses map[tcpip.Address]portNode
-
-// isAvailable checks whether an IP address is available to bind to.
-func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool) bool {
-	if addr == anyIPAddress {
-		if len(b) == 0 {
-			return true
-		}
+// deviceNode is never empty. When it has no elements, it is removed from the
+// map that references it.
+type deviceNode map[tcpip.NICID]portNode
+
+// isAvailable checks whether binding is possible by device. If not binding to a
+// device, check against all portNodes. If binding to a specific device, check
+// against the unspecified device and the provided device.
+func (d deviceNode) isAvailable(reuse bool, bindToDevice tcpip.NICID) bool {
+	if bindToDevice == 0 {
+		// Trying to binding all devices.
 		if !reuse {
+			// Can't bind because the (addr,port) is already bound.
 			return false
 		}
-		for _, n := range b {
-			if !n.reuse {
+		for _, p := range d {
+			if !p.reuse {
+				// Can't bind because the (addr,port) was previously bound without reuse.
 				return false
 			}
 		}
 		return true
 	}
 
-	// If all addresses for this portDescriptor are already bound, no
-	// address is available.
-	if n, ok := b[anyIPAddress]; ok {
-		if !reuse {
+	if p, ok := d[0]; ok {
+		if !reuse || !p.reuse {
 			return false
 		}
-		if !n.reuse {
+	}
+
+	if p, ok := d[bindToDevice]; ok {
+		if !reuse || !p.reuse {
 			return false
 		}
 	}
 
-	if n, ok := b[addr]; ok {
-		if !reuse {
+	return true
+}
+
+// bindAddresses is a set of IP addresses.
+type bindAddresses map[tcpip.Address]deviceNode
+
+// isAvailable checks whether an IP address is available to bind to. If the
+// address is the "any" address, check all other addresses. Otherwise, just
+// check against the "any" address and the provided address.
+func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice tcpip.NICID) bool {
+	if addr == anyIPAddress {
+		// If binding to the "any" address then check that there are no conflicts
+		// with all addresses.
+		for _, d := range b {
+			if !d.isAvailable(reuse, bindToDevice) {
+				return false
+			}
+		}
+		return true
+	}
+
+	// Check that there is no conflict with the "any" address.
+	if d, ok := b[anyIPAddress]; ok {
+		if !d.isAvailable(reuse, bindToDevice) {
+			return false
+		}
+	}
+
+	// Check that this is no conflict with the provided address.
+	if d, ok := b[addr]; ok {
+		if !d.isAvailable(reuse, bindToDevice) {
 			return false
 		}
-		return n.reuse
 	}
+
 	return true
 }
 
@@ -116,17 +149,17 @@ func (s *PortManager) PickEphemeralPort(testPort func(p uint16) (bool, *tcpip.Er
 }
 
 // IsPortAvailable tests if the given port is available on all given protocols.
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) bool {
+func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	return s.isPortAvailableLocked(networks, transport, addr, port, reuse)
+	return s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice)
 }
 
-func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) bool {
+func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if addrs, ok := s.allocatedPorts[desc]; ok {
-			if !addrs.isAvailable(addr, reuse) {
+			if !addrs.isAvailable(addr, reuse, bindToDevice) {
 				return false
 			}
 		}
@@ -138,14 +171,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) (reservedPort uint16, err *tcpip.Error) {
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	// If a port is specified, just try to reserve it for all network
 	// protocols.
 	if port != 0 {
-		if !s.reserveSpecificPort(networks, transport, addr, port, reuse) {
+		if !s.reserveSpecificPort(networks, transport, addr, port, reuse, bindToDevice) {
 			return 0, tcpip.ErrPortInUse
 		}
 		return port, nil
@@ -153,13 +186,13 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 
 	// A port wasn't specified, so try to find one.
 	return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
-		return s.reserveSpecificPort(networks, transport, addr, p, reuse), nil
+		return s.reserveSpecificPort(networks, transport, addr, p, reuse, bindToDevice), nil
 	})
 }
 
 // reserveSpecificPort tries to reserve the given port on all given protocols.
-func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) bool {
-	if !s.isPortAvailableLocked(networks, transport, addr, port, reuse) {
+func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+	if !s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) {
 		return false
 	}
 
@@ -171,11 +204,16 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 			m = make(bindAddresses)
 			s.allocatedPorts[desc] = m
 		}
-		if n, ok := m[addr]; ok {
+		d, ok := m[addr]
+		if !ok {
+			d = make(deviceNode)
+			m[addr] = d
+		}
+		if n, ok := d[bindToDevice]; ok {
 			n.refs++
-			m[addr] = n
+			d[bindToDevice] = n
 		} else {
-			m[addr] = portNode{reuse: reuse, refs: 1}
+			d[bindToDevice] = portNode{reuse: reuse, refs: 1}
 		}
 	}
 
@@ -184,22 +222,28 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 
 // ReleasePort releases the reservation on a port/IP combination so that it can
 // be reserved by other endpoints.
-func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) {
+func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, bindToDevice tcpip.NICID) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if m, ok := s.allocatedPorts[desc]; ok {
-			n, ok := m[addr]
+			d, ok := m[addr]
+			if !ok {
+				continue
+			}
+			n, ok := d[bindToDevice]
 			if !ok {
 				continue
 			}
 			n.refs--
+			d[bindToDevice] = n
 			if n.refs == 0 {
+				delete(d, bindToDevice)
+			}
+			if len(d) == 0 {
 				delete(m, addr)
-			} else {
-				m[addr] = n
 			}
 			if len(m) == 0 {
 				delete(s.allocatedPorts, desc)
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 689401661..a67e283f1 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -34,6 +34,7 @@ type portReserveTestAction struct {
 	want    *tcpip.Error
 	reuse   bool
 	release bool
+	device  tcpip.NICID
 }
 
 func TestPortReservation(t *testing.T) {
@@ -100,6 +101,112 @@ func TestPortReservation(t *testing.T) {
 				{port: 24, ip: anyIPAddress, release: true},
 				{port: 24, ip: anyIPAddress, reuse: false, want: nil},
 			},
+		}, {
+			tname: "bind twice with device fails",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 3, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 3, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind to device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 1, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 2, want: nil},
+			},
+		}, {
+			tname: "bind to device and then without device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind without device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, want: nil},
+				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuse",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+			},
+		}, {
+			tname: "binding with reuse and device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "mixing reuse and not reuse by binding to device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 999, want: nil},
+			},
+		}, {
+			tname: "can't bind to 0 after mixing reuse and not reuse",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind and release",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+
+				// Release the bind to device 0 and try again.
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil},
+			},
+		}, {
+			tname: "bind twice with reuse once",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "release an unreserved device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil},
+				// The below don't exist.
+				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil, release: true},
+				{port: 9999, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
+				// Release all.
+				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil, release: true},
+			},
 		},
 	} {
 		t.Run(test.tname, func(t *testing.T) {
@@ -108,12 +215,12 @@ func TestPortReservation(t *testing.T) {
 
 			for _, test := range test.actions {
 				if test.release {
-					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port)
+					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.device)
 					continue
 				}
-				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse)
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse, test.device)
 				if err != test.want {
-					t.Fatalf("ReservePort(.., .., %s, %d, %t) = %v, want %v", test.ip, test.port, test.release, err, test.want)
+					t.Fatalf("ReservePort(.., .., %s, %d, %t, %d) = %v, want %v", test.ip, test.port, test.reuse, test.device, err, test.want)
 				}
 				if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) {
 					t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral)
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 28c49e8ff..3842f1f7d 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -54,6 +54,7 @@ go_test(
     size = "small",
     srcs = [
         "stack_test.go",
+        "transport_demuxer_test.go",
         "transport_test.go",
     ],
     deps = [
@@ -64,6 +65,9 @@ go_test(
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 0e8a23f00..f6106f762 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -34,8 +34,6 @@ type NIC struct {
 	linkEP   LinkEndpoint
 	loopback bool
 
-	demux *transportDemuxer
-
 	mu            sync.RWMutex
 	spoofing      bool
 	promiscuous   bool
@@ -85,7 +83,6 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 		name:       name,
 		linkEP:     ep,
 		loopback:   loopback,
-		demux:      newTransportDemuxer(stack),
 		primary:    make(map[tcpip.NetworkProtocolNumber]*ilist.List),
 		endpoints:  make(map[NetworkEndpointID]*referencedNetworkEndpoint),
 		mcastJoins: make(map[NetworkEndpointID]int32),
@@ -707,9 +704,7 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// Raw socket packets are delivered based solely on the transport
 	// protocol number. We do not inspect the payload to ensure it's
 	// validly formed.
-	if !n.demux.deliverRawPacket(r, protocol, netHeader, vv) {
-		n.stack.demux.deliverRawPacket(r, protocol, netHeader, vv)
-	}
+	n.stack.demux.deliverRawPacket(r, protocol, netHeader, vv)
 
 	if len(vv.First()) < transProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
@@ -723,9 +718,6 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	}
 
 	id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress}
-	if n.demux.deliverPacket(r, protocol, netHeader, vv, id) {
-		return
-	}
 	if n.stack.demux.deliverPacket(r, protocol, netHeader, vv, id) {
 		return
 	}
@@ -767,10 +759,7 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	}
 
 	id := TransportEndpointID{srcPort, local, dstPort, remote}
-	if n.demux.deliverControlPacket(net, trans, typ, extra, vv, id) {
-		return
-	}
-	if n.stack.demux.deliverControlPacket(net, trans, typ, extra, vv, id) {
+	if n.stack.demux.deliverControlPacket(n, net, trans, typ, extra, vv, id) {
 		return
 	}
 }
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 18d1704a5..6a8079823 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1033,73 +1033,27 @@ func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.
 // transport dispatcher. Received packets that match the provided id will be
 // delivered to the given endpoint; specifying a nic is optional, but
 // nic-specific IDs have precedence over global ones.
-func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error {
-	if nicID == 0 {
-		return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort)
-	}
-
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	nic := s.nics[nicID]
-	if nic == nil {
-		return tcpip.ErrUnknownNICID
-	}
-
-	return nic.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort)
+func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+	return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort, bindToDevice)
 }
 
 // UnregisterTransportEndpoint removes the endpoint with the given id from the
 // stack transport dispatcher.
-func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) {
-	if nicID == 0 {
-		s.demux.unregisterEndpoint(netProtos, protocol, id, ep)
-		return
-	}
-
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	nic := s.nics[nicID]
-	if nic != nil {
-		nic.demux.unregisterEndpoint(netProtos, protocol, id, ep)
-	}
+func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
 }
 
 // RegisterRawTransportEndpoint registers the given endpoint with the stack
 // transport dispatcher. Received packets that match the provided transport
 // protocol will be delivered to the given endpoint.
 func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
-	if nicID == 0 {
-		return s.demux.registerRawEndpoint(netProto, transProto, ep)
-	}
-
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	nic := s.nics[nicID]
-	if nic == nil {
-		return tcpip.ErrUnknownNICID
-	}
-
-	return nic.demux.registerRawEndpoint(netProto, transProto, ep)
+	return s.demux.registerRawEndpoint(netProto, transProto, ep)
 }
 
 // UnregisterRawTransportEndpoint removes the endpoint for the transport
 // protocol from the stack transport dispatcher.
 func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) {
-	if nicID == 0 {
-		s.demux.unregisterRawEndpoint(netProto, transProto, ep)
-		return
-	}
-
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	nic := s.nics[nicID]
-	if nic != nil {
-		nic.demux.unregisterRawEndpoint(netProto, transProto, ep)
-	}
+	s.demux.unregisterRawEndpoint(netProto, transProto, ep)
 }
 
 // RegisterRestoredEndpoint records e as an endpoint that has been restored on
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index cf8a6d129..8c768c299 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -35,25 +35,109 @@ type protocolIDs struct {
 type transportEndpoints struct {
 	// mu protects all fields of the transportEndpoints.
 	mu        sync.RWMutex
-	endpoints map[TransportEndpointID]TransportEndpoint
+	endpoints map[TransportEndpointID]*endpointsByNic
 	// rawEndpoints contains endpoints for raw sockets, which receive all
 	// traffic of a given protocol regardless of port.
 	rawEndpoints []RawTransportEndpoint
 }
 
+type endpointsByNic struct {
+	mu        sync.RWMutex
+	endpoints map[tcpip.NICID]*multiPortEndpoint
+	// seed is a random secret for a jenkins hash.
+	seed uint32
+}
+
+// HandlePacket is called by the stack when new packets arrive to this transport
+// endpoint.
+func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+	epsByNic.mu.RLock()
+
+	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	if !ok {
+		if mpep, ok = epsByNic.endpoints[0]; !ok {
+			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+			return
+		}
+	}
+
+	// If this is a broadcast or multicast datagram, deliver the datagram to all
+	// endpoints bound to the right device.
+	if id.LocalAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(id.LocalAddress) || header.IsV6MulticastAddress(id.LocalAddress) {
+		mpep.handlePacketAll(r, id, vv)
+		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		return
+	}
+
+	// multiPortEndpoints are guaranteed to have at least one element.
+	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, vv)
+	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+}
+
+// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
+func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+	epsByNic.mu.RLock()
+	defer epsByNic.mu.RUnlock()
+
+	mpep, ok := epsByNic.endpoints[n.ID()]
+	if !ok {
+		mpep, ok = epsByNic.endpoints[0]
+	}
+	if !ok {
+		return
+	}
+
+	// TODO(eyalsoha): Why don't we look at id to see if this packet needs to
+	// broadcast like we are doing with handlePacket above?
+
+	// multiPortEndpoints are guaranteed to have at least one element.
+	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, vv)
+}
+
+// registerEndpoint returns true if it succeeds. It fails and returns
+// false if ep already has an element with the same key.
+func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+	epsByNic.mu.Lock()
+	defer epsByNic.mu.Unlock()
+
+	if multiPortEp, ok := epsByNic.endpoints[bindToDevice]; ok {
+		// There was already a bind.
+		return multiPortEp.singleRegisterEndpoint(t, reusePort)
+	}
+
+	// This is a new binding.
+	multiPortEp := &multiPortEndpoint{}
+	multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
+	multiPortEp.reuse = reusePort
+	epsByNic.endpoints[bindToDevice] = multiPortEp
+	return multiPortEp.singleRegisterEndpoint(t, reusePort)
+}
+
+// unregisterEndpoint returns true if endpointsByNic has to be unregistered.
+func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
+	epsByNic.mu.Lock()
+	defer epsByNic.mu.Unlock()
+	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+	if !ok {
+		return false
+	}
+	if multiPortEp.unregisterEndpoint(t) {
+		delete(epsByNic.endpoints, bindToDevice)
+	}
+	return len(epsByNic.endpoints) == 0
+}
+
 // unregisterEndpoint unregisters the endpoint with the given id such that it
 // won't receive any more packets.
-func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint) {
+func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
-	e, ok := eps.endpoints[id]
+	epsByNic, ok := eps.endpoints[id]
 	if !ok {
 		return
 	}
-	if multiPortEp, ok := e.(*multiPortEndpoint); ok {
-		if !multiPortEp.unregisterEndpoint(ep) {
-			return
-		}
+	if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
+		return
 	}
 	delete(eps.endpoints, id)
 }
@@ -75,7 +159,7 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
 	for netProto := range stack.networkProtocols {
 		for proto := range stack.transportProtocols {
 			d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{
-				endpoints: make(map[TransportEndpointID]TransportEndpoint),
+				endpoints: make(map[TransportEndpointID]*endpointsByNic),
 			}
 		}
 	}
@@ -85,10 +169,10 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
 
 // registerEndpoint registers the given endpoint with the dispatcher such that
 // packets that match the endpoint ID are delivered to it.
-func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error {
+func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
 	for i, n := range netProtos {
-		if err := d.singleRegisterEndpoint(n, protocol, id, ep, reusePort); err != nil {
-			d.unregisterEndpoint(netProtos[:i], protocol, id, ep)
+		if err := d.singleRegisterEndpoint(n, protocol, id, ep, reusePort, bindToDevice); err != nil {
+			d.unregisterEndpoint(netProtos[:i], protocol, id, ep, bindToDevice)
 			return err
 		}
 	}
@@ -97,13 +181,14 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 }
 
 // multiPortEndpoint is a container for TransportEndpoints which are bound to
-// the same pair of address and port.
+// the same pair of address and port. endpointsArr always has at least one
+// element.
 type multiPortEndpoint struct {
 	mu           sync.RWMutex
 	endpointsArr []TransportEndpoint
 	endpointsMap map[TransportEndpoint]int
-	// seed is a random secret for a jenkins hash.
-	seed uint32
+	// reuse indicates if more than one endpoint is allowed.
+	reuse bool
 }
 
 // reciprocalScale scales a value into range [0, n).
@@ -117,9 +202,10 @@ func reciprocalScale(val, n uint32) uint32 {
 // selectEndpoint calculates a hash of destination and source addresses and
 // ports then uses it to select a socket. In this case, all packets from one
 // address will be sent to same endpoint.
-func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEndpoint {
-	ep.mu.RLock()
-	defer ep.mu.RUnlock()
+func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint {
+	if len(mpep.endpointsArr) == 1 {
+		return mpep.endpointsArr[0]
+	}
 
 	payload := []byte{
 		byte(id.LocalPort),
@@ -128,51 +214,50 @@ func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEnd
 		byte(id.RemotePort >> 8),
 	}
 
-	h := jenkins.Sum32(ep.seed)
+	h := jenkins.Sum32(seed)
 	h.Write(payload)
 	h.Write([]byte(id.LocalAddress))
 	h.Write([]byte(id.RemoteAddress))
 	hash := h.Sum32()
 
-	idx := reciprocalScale(hash, uint32(len(ep.endpointsArr)))
-	return ep.endpointsArr[idx]
+	idx := reciprocalScale(hash, uint32(len(mpep.endpointsArr)))
+	return mpep.endpointsArr[idx]
 }
 
-// HandlePacket is called by the stack when new packets arrive to this transport
-// endpoint.
-func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
-	// If this is a broadcast or multicast datagram, deliver the datagram to all
-	// endpoints managed by ep.
-	if id.LocalAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(id.LocalAddress) || header.IsV6MulticastAddress(id.LocalAddress) {
-		for i, endpoint := range ep.endpointsArr {
-			// HandlePacket modifies vv, so each endpoint needs its own copy.
-			if i == len(ep.endpointsArr)-1 {
-				endpoint.HandlePacket(r, id, vv)
-				break
-			}
-			vvCopy := buffer.NewView(vv.Size())
-			copy(vvCopy, vv.ToView())
-			endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView())
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+	ep.mu.RLock()
+	for i, endpoint := range ep.endpointsArr {
+		// HandlePacket modifies vv, so each endpoint needs its own copy except for
+		// the final one.
+		if i == len(ep.endpointsArr)-1 {
+			endpoint.HandlePacket(r, id, vv)
+			break
 		}
-	} else {
-		ep.selectEndpoint(id).HandlePacket(r, id, vv)
+		vvCopy := buffer.NewView(vv.Size())
+		copy(vvCopy, vv.ToView())
+		endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView())
 	}
+	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
-// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (ep *multiPortEndpoint) HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) {
-	ep.selectEndpoint(id).HandleControlPacket(id, typ, extra, vv)
-}
-
-func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint) {
+// singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
+// list. The list might be empty already.
+func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error {
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
-	// A new endpoint is added into endpointsArr and its index there is
-	// saved in endpointsMap. This will allows to remove endpoint from
-	// the array fast.
+	if len(ep.endpointsArr) > 0 {
+		// If it was previously bound, we need to check if we can bind again.
+		if !ep.reuse || !reusePort {
+			return tcpip.ErrPortInUse
+		}
+	}
+
+	// A new endpoint is added into endpointsArr and its index there is saved in
+	// endpointsMap. This will allow us to remove endpoint from the array fast.
 	ep.endpointsMap[t] = len(ep.endpointsArr)
 	ep.endpointsArr = append(ep.endpointsArr, t)
+	return nil
 }
 
 // unregisterEndpoint returns true if multiPortEndpoint has to be unregistered.
@@ -197,53 +282,41 @@ func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool {
 	return true
 }
 
-func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error {
+func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
 	if id.RemotePort != 0 {
+		// TODO(eyalsoha): Why?
 		reusePort = false
 	}
 
 	eps, ok := d.protocol[protocolIDs{netProto, protocol}]
 	if !ok {
-		return nil
+		return tcpip.ErrUnknownProtocol
 	}
 
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
 
-	var multiPortEp *multiPortEndpoint
-	if _, ok := eps.endpoints[id]; ok {
-		if !reusePort {
-			return tcpip.ErrPortInUse
-		}
-		multiPortEp, ok = eps.endpoints[id].(*multiPortEndpoint)
-		if !ok {
-			return tcpip.ErrPortInUse
-		}
+	if epsByNic, ok := eps.endpoints[id]; ok {
+		// There was already a binding.
+		return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
 	}
 
-	if reusePort {
-		if multiPortEp == nil {
-			multiPortEp = &multiPortEndpoint{}
-			multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
-			multiPortEp.seed = rand.Uint32()
-			eps.endpoints[id] = multiPortEp
-		}
-
-		multiPortEp.singleRegisterEndpoint(ep)
-
-		return nil
+	// This is a new binding.
+	epsByNic := &endpointsByNic{
+		endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
+		seed:      rand.Uint32(),
 	}
-	eps.endpoints[id] = ep
+	eps.endpoints[id] = epsByNic
 
-	return nil
+	return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
 // won't receive any more packets.
-func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) {
+func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
 	for _, n := range netProtos {
 		if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok {
-			eps.unregisterEndpoint(id, ep)
+			eps.unregisterEndpoint(id, ep, bindToDevice)
 		}
 	}
 }
@@ -273,7 +346,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 
 	// If the packet is a broadcast, then find all matching transport endpoints.
 	// Otherwise, try to find a single matching transport endpoint.
-	destEps := make([]TransportEndpoint, 0, 1)
+	destEps := make([]*endpointsByNic, 0, 1)
 	eps.mu.RLock()
 
 	if protocol == header.UDPProtocolNumber && id.LocalAddress == header.IPv4Broadcast {
@@ -299,7 +372,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 
 	// Deliver the packet.
 	for _, ep := range destEps {
-		ep.HandlePacket(r, id, vv)
+		ep.handlePacket(r, id, vv)
 	}
 
 	return true
@@ -331,7 +404,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
@@ -348,12 +421,12 @@ func (d *transportDemuxer) deliverControlPacket(net tcpip.NetworkProtocolNumber,
 	}
 
 	// Deliver the packet.
-	ep.HandleControlPacket(id, typ, extra, vv)
+	ep.handleControlPacket(n, id, typ, extra, vv)
 
 	return true
 }
 
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) TransportEndpoint {
+func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) *endpointsByNic {
 	// Try to find a match with the id as provided.
 	if ep, ok := eps.endpoints[id]; ok {
 		return ep
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
new file mode 100644
index 000000000..210233dc0
--- /dev/null
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -0,0 +1,352 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	stackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	testV6Addr  = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+
+	stackAddr = "\x0a\x00\x00\x01"
+	stackPort = 1234
+	testPort  = 4096
+)
+
+type testContext struct {
+	t       *testing.T
+	linkEPs map[string]*channel.Endpoint
+	s       *stack.Stack
+
+	ep tcpip.Endpoint
+	wq waiter.Queue
+}
+
+func (c *testContext) cleanup() {
+	if c.ep != nil {
+		c.ep.Close()
+	}
+}
+
+func (c *testContext) createV6Endpoint(v6only bool) {
+	var err *tcpip.Error
+	c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &c.wq)
+	if err != nil {
+		c.t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	var v tcpip.V6OnlyOption
+	if v6only {
+		v = 1
+	}
+	if err := c.ep.SetSockOpt(v); err != nil {
+		c.t.Fatalf("SetSockOpt failed: %v", err)
+	}
+}
+
+// newDualTestContextMultiNic creates the testing context and also linkEpNames
+// named NICs.
+func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string) *testContext {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+	linkEPs := make(map[string]*channel.Endpoint)
+	for i, linkEpName := range linkEpNames {
+		channelEP := channel.New(256, mtu, "")
+		nicid := tcpip.NICID(i + 1)
+		if err := s.CreateNamedNIC(nicid, linkEpName, channelEP); err != nil {
+			t.Fatalf("CreateNIC failed: %v", err)
+		}
+		linkEPs[linkEpName] = channelEP
+
+		if err := s.AddAddress(nicid, ipv4.ProtocolNumber, stackAddr); err != nil {
+			t.Fatalf("AddAddress IPv4 failed: %v", err)
+		}
+
+		if err := s.AddAddress(nicid, ipv6.ProtocolNumber, stackV6Addr); err != nil {
+			t.Fatalf("AddAddress IPv6 failed: %v", err)
+		}
+	}
+
+	s.SetRouteTable([]tcpip.Route{
+		{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         1,
+		},
+		{
+			Destination: header.IPv6EmptySubnet,
+			NIC:         1,
+		},
+	})
+
+	return &testContext{
+		t:       t,
+		s:       s,
+		linkEPs: linkEPs,
+	}
+}
+
+type headers struct {
+	srcPort uint16
+	dstPort uint16
+}
+
+func newPayload() []byte {
+	b := make([]byte, 30+rand.Intn(100))
+	for i := range b {
+		b[i] = byte(rand.Intn(256))
+	}
+	return b
+}
+
+func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string) {
+	// Allocate a buffer for data and headers.
+	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
+	copy(buf[len(buf)-len(payload):], payload)
+
+	// Initialize the IP header.
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       testV6Addr,
+		DstAddr:       stackV6Addr,
+	})
+
+	// Initialize the UDP header.
+	u := header.UDP(buf[header.IPv6MinimumSize:])
+	u.Encode(&header.UDPFields{
+		SrcPort: h.srcPort,
+		DstPort: h.dstPort,
+		Length:  uint16(header.UDPMinimumSize + len(payload)),
+	})
+
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testV6Addr, stackV6Addr, uint16(len(u)))
+
+	// Calculate the UDP checksum and set it.
+	xsum = header.Checksum(payload, xsum)
+	u.SetChecksum(^u.CalculateChecksum(xsum))
+
+	// Inject packet.
+	c.linkEPs[linkEpName].Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+}
+
+func TestTransportDemuxerRegister(t *testing.T) {
+	for _, test := range []struct {
+		name  string
+		proto tcpip.NetworkProtocolNumber
+		want  *tcpip.Error
+	}{
+		{"failure", ipv6.ProtocolNumber, tcpip.ErrUnknownProtocol},
+		{"success", ipv4.ProtocolNumber, nil},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, nil, false, 0), test.want; got != want {
+				t.Fatalf("s.RegisterTransportEndpoint(...) = %v, want %v", got, want)
+			}
+		})
+	}
+}
+
+// TestReuseBindToDevice injects varied packets on input devices and checks that
+// the distribution of packets received matches expectations.
+func TestDistribution(t *testing.T) {
+	type endpointSockopts struct {
+		reuse        int
+		bindToDevice string
+	}
+	for _, test := range []struct {
+		name string
+		// endpoints will received the inject packets.
+		endpoints []endpointSockopts
+		// wantedDistribution is the wanted ratio of packets received on each
+		// endpoint for each NIC on which packets are injected.
+		wantedDistributions map[string][]float64
+	}{
+		{
+			"BindPortReuse",
+			// 5 endpoints that all have reuse set.
+			[]endpointSockopts{
+				endpointSockopts{1, ""},
+				endpointSockopts{1, ""},
+				endpointSockopts{1, ""},
+				endpointSockopts{1, ""},
+				endpointSockopts{1, ""},
+			},
+			map[string][]float64{
+				// Injected packets on dev0 get distributed evenly.
+				"dev0": []float64{0.2, 0.2, 0.2, 0.2, 0.2},
+			},
+		},
+		{
+			"BindToDevice",
+			// 3 endpoints with various bindings.
+			[]endpointSockopts{
+				endpointSockopts{0, "dev0"},
+				endpointSockopts{0, "dev1"},
+				endpointSockopts{0, "dev2"},
+			},
+			map[string][]float64{
+				// Injected packets on dev0 go only to the endpoint bound to dev0.
+				"dev0": []float64{1, 0, 0},
+				// Injected packets on dev1 go only to the endpoint bound to dev1.
+				"dev1": []float64{0, 1, 0},
+				// Injected packets on dev2 go only to the endpoint bound to dev2.
+				"dev2": []float64{0, 0, 1},
+			},
+		},
+		{
+			"ReuseAndBindToDevice",
+			// 6 endpoints with various bindings.
+			[]endpointSockopts{
+				endpointSockopts{1, "dev0"},
+				endpointSockopts{1, "dev0"},
+				endpointSockopts{1, "dev1"},
+				endpointSockopts{1, "dev1"},
+				endpointSockopts{1, "dev1"},
+				endpointSockopts{1, ""},
+			},
+			map[string][]float64{
+				// Injected packets on dev0 get distributed among endpoints bound to
+				// dev0.
+				"dev0": []float64{0.5, 0.5, 0, 0, 0, 0},
+				// Injected packets on dev1 get distributed among endpoints bound to
+				// dev1 or unbound.
+				"dev1": []float64{0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
+				// Injected packets on dev999 go only to the unbound.
+				"dev999": []float64{0, 0, 0, 0, 0, 1},
+			},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			for device, wantedDistribution := range test.wantedDistributions {
+				t.Run(device, func(t *testing.T) {
+					var devices []string
+					for d := range test.wantedDistributions {
+						devices = append(devices, d)
+					}
+					c := newDualTestContextMultiNic(t, defaultMTU, devices)
+					defer c.cleanup()
+
+					c.createV6Endpoint(false)
+
+					eps := make(map[tcpip.Endpoint]int)
+
+					pollChannel := make(chan tcpip.Endpoint)
+					for i, endpoint := range test.endpoints {
+						// Try to receive the data.
+						wq := waiter.Queue{}
+						we, ch := waiter.NewChannelEntry(nil)
+						wq.EventRegister(&we, waiter.EventIn)
+						defer wq.EventUnregister(&we)
+						defer close(ch)
+
+						var err *tcpip.Error
+						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq)
+						if err != nil {
+							c.t.Fatalf("NewEndpoint failed: %v", err)
+						}
+						eps[ep] = i
+
+						go func(ep tcpip.Endpoint) {
+							for range ch {
+								pollChannel <- ep
+							}
+						}(ep)
+
+						defer ep.Close()
+						reusePortOption := tcpip.ReusePortOption(endpoint.reuse)
+						if err := ep.SetSockOpt(reusePortOption); err != nil {
+							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", reusePortOption, i, err)
+						}
+						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
+						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
+							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", bindToDeviceOption, i, err)
+						}
+						if err := ep.Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil {
+							t.Fatalf("ep.Bind(...) on endpoint %d failed: %v", i, err)
+						}
+					}
+
+					npackets := 100000
+					nports := 10000
+					if got, want := len(test.endpoints), len(wantedDistribution); got != want {
+						t.Fatalf("got len(test.endpoints) = %d, want %d", got, want)
+					}
+					ports := make(map[uint16]tcpip.Endpoint)
+					stats := make(map[tcpip.Endpoint]int)
+					for i := 0; i < npackets; i++ {
+						// Send a packet.
+						port := uint16(i % nports)
+						payload := newPayload()
+						c.sendV6Packet(payload,
+							&headers{
+								srcPort: testPort + port,
+								dstPort: stackPort},
+							device)
+
+						var addr tcpip.FullAddress
+						ep := <-pollChannel
+						_, _, err := ep.Read(&addr)
+						if err != nil {
+							c.t.Fatalf("Read on endpoint %d failed: %v", eps[ep], err)
+						}
+						stats[ep]++
+						if i < nports {
+							ports[uint16(i)] = ep
+						} else {
+							// Check that all packets from one client are handled by the same
+							// socket.
+							if want, got := ports[port], ep; want != got {
+								t.Fatalf("Packet sent on port %d expected on endpoint %d but received on endpoint %d", port, eps[want], eps[got])
+							}
+						}
+					}
+
+					// Check that a packet distribution is as expected.
+					for ep, i := range eps {
+						wantedRatio := wantedDistribution[i]
+						wantedRecv := wantedRatio * float64(npackets)
+						actualRecv := stats[ep]
+						actualRatio := float64(stats[ep]) / float64(npackets)
+						// The deviation is less than 10%.
+						if math.Abs(actualRatio-wantedRatio) > 0.05 {
+							t.Errorf("wanted about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantedRatio*100, wantedRecv, npackets, i, actualRatio*100, actualRecv, npackets)
+						}
+					}
+				})
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 56e8a5d9b..842a16277 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -127,7 +127,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	// Try to register so that we can start receiving packets.
 	f.id.RemoteAddress = addr.Addr
-	err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.id, f, false)
+	err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.id, f, false /* reuse */, 0 /* bindToDevice */)
 	if err != nil {
 		return err
 	}
@@ -168,7 +168,8 @@ func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error {
 		fakeTransNumber,
 		stack.TransportEndpointID{LocalAddress: a.Addr},
 		f,
-		false,
+		false, /* reuse */
+		0,     /* bindtoDevice */
 	); err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index c021c67ac..faaa4a4e3 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -495,6 +495,10 @@ type ReuseAddressOption int
 // to be bound to an identical socket address.
 type ReusePortOption int
 
+// BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
+// should bind only on a specific NIC.
+type BindToDeviceOption string
+
 // QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
 type QuickAckOption int
 
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index a111fdb2a..a3a910d41 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -104,7 +104,7 @@ func (e *endpoint) Close() {
 	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
 	switch e.state {
 	case stateBound, stateConnected:
-		e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e)
+		e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e, 0 /* bindToDevice */)
 	}
 
 	// Close the receive list and drain it.
@@ -543,14 +543,14 @@ func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.Networ
 	if id.LocalPort != 0 {
 		// The endpoint already has a local port, just attempt to
 		// register it.
-		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false)
+		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false /* reuse */, 0 /* bindToDevice */)
 		return id, err
 	}
 
 	// We need to find a port for the endpoint.
 	_, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
 		id.LocalPort = p
-		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false)
+		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false /* reuse */, 0 /* bindtodevice */)
 		switch err {
 		case nil:
 			return true, nil
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 0802e984e..3ae4a5426 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -242,7 +242,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	n.initGSO()
 
 	// Register new endpoint so that packets are routed to it.
-	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n, n.reusePort); err != nil {
+	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n, n.reusePort, n.bindToDevice); err != nil {
 		n.Close()
 		return nil, err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 35b489c68..a1cd0d481 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -280,6 +280,9 @@ type endpoint struct {
 	// reusePort is set to true if SO_REUSEPORT is enabled.
 	reusePort bool
 
+	// bindToDevice is set to the NIC on which to bind or disabled if 0.
+	bindToDevice tcpip.NICID
+
 	// delay enables Nagle's algorithm.
 	//
 	// delay is a boolean (0 is false) and must be accessed atomically.
@@ -564,11 +567,11 @@ func (e *endpoint) Close() {
 	// in Listen() when trying to register.
 	if e.state == StateListen && e.isPortReserved {
 		if e.isRegistered {
-			e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
+			e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
 			e.isRegistered = false
 		}
 
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -625,12 +628,12 @@ func (e *endpoint) cleanupLocked() {
 	e.workerCleanup = false
 
 	if e.isRegistered {
-		e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
+		e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
 		e.isRegistered = false
 	}
 
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -1060,6 +1063,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.BindToDeviceOption:
+		e.mu.Lock()
+		defer e.mu.Unlock()
+		if v == "" {
+			e.bindToDevice = 0
+			return nil
+		}
+		for nicid, nic := range e.stack.NICInfo() {
+			if nic.Name == string(v) {
+				e.bindToDevice = nicid
+				return nil
+			}
+		}
+		return tcpip.ErrUnknownDevice
+
 	case tcpip.QuickAckOption:
 		if v == 0 {
 			atomic.StoreUint32(&e.slowAck, 1)
@@ -1260,6 +1278,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		return nil
 
+	case *tcpip.BindToDeviceOption:
+		e.mu.RLock()
+		defer e.mu.RUnlock()
+		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
+			*o = tcpip.BindToDeviceOption(nic.Name)
+			return nil
+		}
+		*o = ""
+		return nil
+
 	case *tcpip.QuickAckOption:
 		*o = 1
 		if v := atomic.LoadUint32(&e.slowAck); v != 0 {
@@ -1466,7 +1494,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
 
 	if e.id.LocalPort != 0 {
 		// The endpoint is bound to a port, attempt to register it.
-		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.id, e, e.reusePort)
+		err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.id, e, e.reusePort, e.bindToDevice)
 		if err != nil {
 			return err
 		}
@@ -1480,13 +1508,15 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
 			if sameAddr && p == e.id.RemotePort {
 				return false, nil
 			}
-			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.id.LocalAddress, p, false) {
+			// reusePort is false below because connect cannot reuse a port even if
+			// reusePort was set.
+			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.id.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
 				return false, nil
 			}
 
 			id := e.id
 			id.LocalPort = p
-			switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort) {
+			switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
 			case nil:
 				e.id = id
 				return true, nil
@@ -1504,7 +1534,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
 	// before Connect: in such a case we don't want to hold on to
 	// reservations anymore.
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -1648,7 +1678,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
 	}
 
 	// Register the endpoint.
-	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.reusePort); err != nil {
+	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.reusePort, e.bindToDevice); err != nil {
 		return err
 	}
 
@@ -1729,7 +1759,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 		}
 	}
 
-	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort)
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
 	if err != nil {
 		return err
 	}
@@ -1739,16 +1769,16 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 	e.id.LocalPort = port
 
 	// Any failures beyond this point must remove the port registration.
-	defer func() {
+	defer func(bindToDevice tcpip.NICID) {
 		if err != nil {
-			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port)
+			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
 			e.isPortReserved = false
 			e.effectiveNetProtos = nil
 			e.id.LocalPort = 0
 			e.id.LocalAddress = ""
 			e.boundNICID = 0
 		}
-	}()
+	}(e.bindToDevice)
 
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2be094876..089826a88 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -465,6 +465,66 @@ func TestSimpleReceive(t *testing.T) {
 	)
 }
 
+func TestConnectBindToDevice(t *testing.T) {
+	for _, test := range []struct {
+		name   string
+		device string
+		want   tcp.EndpointState
+	}{
+		{"RightDevice", "nic1", tcp.StateEstablished},
+		{"WrongDevice", "nic2", tcp.StateSynSent},
+		{"AnyDevice", "", tcp.StateEstablished},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.Create(-1)
+			bindToDevice := tcpip.BindToDeviceOption(test.device)
+			c.EP.SetSockOpt(bindToDevice)
+			// Start connection attempt.
+			waitEntry, _ := waiter.NewChannelEntry(nil)
+			c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+			defer c.WQ.EventUnregister(&waitEntry)
+
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("Unexpected return value from Connect: %v", err)
+			}
+
+			// Receive SYN packet.
+			b := c.GetPacket()
+			checker.IPv4(t, b,
+				checker.TCP(
+					checker.DstPort(context.TestPort),
+					checker.TCPFlags(header.TCPFlagSyn),
+				),
+			)
+			if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+				t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+			}
+			tcpHdr := header.TCP(header.IPv4(b).Payload())
+			c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+			iss := seqnum.Value(789)
+			rcvWnd := seqnum.Size(30000)
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: tcpHdr.DestinationPort(),
+				DstPort: tcpHdr.SourcePort(),
+				Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+				SeqNum:  iss,
+				AckNum:  c.IRS.Add(1),
+				RcvWnd:  rcvWnd,
+				TCPOpts: nil,
+			})
+
+			c.GetPacket()
+			if got, want := tcp.EndpointState(c.EP.State()), test.want; got != want {
+				t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+			}
+		})
+	}
+}
+
 func TestOutOfOrderReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -2970,6 +3030,62 @@ func TestMinMaxBufferSizes(t *testing.T) {
 	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30)
 }
 
+func TestBindToDeviceOption(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}})
+
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %v", err)
+	}
+	defer ep.Close()
+
+	if err := s.CreateNamedNIC(321, "my_device", loopback.New()); err != nil {
+		t.Errorf("CreateNamedNIC failed: %v", err)
+	}
+
+	// Make an nameless NIC.
+	if err := s.CreateNIC(54321, loopback.New()); err != nil {
+		t.Errorf("CreateNIC failed: %v", err)
+	}
+
+	// strPtr is used instead of taking the address of string literals, which is
+	// a compiler error.
+	strPtr := func(s string) *string {
+		return &s
+	}
+
+	testActions := []struct {
+		name                 string
+		setBindToDevice      *string
+		setBindToDeviceError *tcpip.Error
+		getBindToDevice      tcpip.BindToDeviceOption
+	}{
+		{"GetDefaultValue", nil, nil, ""},
+		{"BindToNonExistent", strPtr("non_existent_device"), tcpip.ErrUnknownDevice, ""},
+		{"BindToExistent", strPtr("my_device"), nil, "my_device"},
+		{"UnbindToDevice", strPtr(""), nil, ""},
+	}
+	for _, testAction := range testActions {
+		t.Run(testAction.name, func(t *testing.T) {
+			if testAction.setBindToDevice != nil {
+				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
+				if got, want := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; got != want {
+					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, got, want)
+				}
+			}
+			bindToDevice := tcpip.BindToDeviceOption("to be modified by GetSockOpt")
+			if ep.GetSockOpt(&bindToDevice) != nil {
+				t.Errorf("GetSockOpt got %v, want %v", ep.GetSockOpt(&bindToDevice), nil)
+			}
+			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
+				t.Errorf("bindToDevice got %q, want %q", got, want)
+			}
+		})
+	}
+}
+
 func makeStack() (*stack.Stack, *tcpip.Error) {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocol{
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index d3f1d2cdf..ef823e4ae 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -158,7 +158,14 @@ func New(t *testing.T, mtu uint32) *Context {
 	if testing.Verbose() {
 		wep = sniffer.New(ep)
 	}
-	if err := s.CreateNIC(1, wep); err != nil {
+	if err := s.CreateNamedNIC(1, "nic1", wep); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+	wep2 := stack.LinkEndpoint(channel.New(1000, mtu, ""))
+	if testing.Verbose() {
+		wep2 = sniffer.New(channel.New(1000, mtu, ""))
+	}
+	if err := s.CreateNamedNIC(2, "nic2", wep2); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
 	}
 
@@ -588,12 +595,8 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
 	c.Port = tcpHdr.SourcePort()
 }
 
-// CreateConnectedWithRawOptions creates a connected TCP endpoint and sends
-// the specified option bytes as the Option field in the initial SYN packet.
-//
-// It also sets the receive buffer for the endpoint to the specified
-// value in epRcvBuf.
-func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf int, options []byte) {
+// Create creates a TCP endpoint.
+func (c *Context) Create(epRcvBuf int) {
 	// Create TCP endpoint.
 	var err *tcpip.Error
 	c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
@@ -606,6 +609,15 @@ func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.
 			c.t.Fatalf("SetSockOpt failed failed: %v", err)
 		}
 	}
+}
+
+// CreateConnectedWithRawOptions creates a connected TCP endpoint and sends
+// the specified option bytes as the Option field in the initial SYN packet.
+//
+// It also sets the receive buffer for the endpoint to the specified
+// value in epRcvBuf.
+func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf int, options []byte) {
+	c.Create(epRcvBuf)
 	c.Connect(iss, rcvWnd, options)
 }
 
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index c1ca22b35..7a635ab8d 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -52,6 +52,7 @@ go_test(
         "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/link/sniffer",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 0bec7e62d..52f5af777 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -88,6 +88,7 @@ type endpoint struct {
 	multicastNICID tcpip.NICID
 	multicastLoop  bool
 	reusePort      bool
+	bindToDevice   tcpip.NICID
 	broadcast      bool
 
 	// shutdownFlags represent the current shutdown state of the endpoint.
@@ -144,8 +145,8 @@ func (e *endpoint) Close() {
 
 	switch e.state {
 	case StateBound, StateConnected:
-		e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort)
+		e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice)
 	}
 
 	for _, mem := range e.multicastMemberships {
@@ -551,6 +552,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.reusePort = v != 0
 		e.mu.Unlock()
 
+	case tcpip.BindToDeviceOption:
+		e.mu.Lock()
+		defer e.mu.Unlock()
+		if v == "" {
+			e.bindToDevice = 0
+			return nil
+		}
+		for nicid, nic := range e.stack.NICInfo() {
+			if nic.Name == string(v) {
+				e.bindToDevice = nicid
+				return nil
+			}
+		}
+		return tcpip.ErrUnknownDevice
+
 	case tcpip.BroadcastOption:
 		e.mu.Lock()
 		e.broadcast = v != 0
@@ -646,6 +662,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		}
 		return nil
 
+	case *tcpip.BindToDeviceOption:
+		e.mu.RLock()
+		defer e.mu.RUnlock()
+		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
+			*o = tcpip.BindToDeviceOption(nic.Name)
+			return nil
+		}
+		*o = tcpip.BindToDeviceOption("")
+		return nil
+
 	case *tcpip.KeepaliveEnabledOption:
 		*o = 0
 		return nil
@@ -753,12 +779,12 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	} else {
 		if e.id.LocalPort != 0 {
 			// Release the ephemeral port.
-			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort)
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice)
 		}
 		e.state = StateInitial
 	}
 
-	e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
+	e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
 	e.id = id
 	e.route.Release()
 	e.route = stack.Route{}
@@ -835,7 +861,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	// Remove the old registration.
 	if e.id.LocalPort != 0 {
-		e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
+		e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice)
 	}
 
 	e.id = id
@@ -898,16 +924,16 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
 	if e.id.LocalPort == 0 {
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort)
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
 		if err != nil {
 			return id, err
 		}
 		id.LocalPort = port
 	}
 
-	err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort)
+	err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
 	if err != nil {
-		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort)
+		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
 	}
 	return id, err
 }
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index a9edc2c8d..2d0bc5221 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -74,7 +74,7 @@ func (r *ForwarderRequest) ID() stack.TransportEndpointID {
 // CreateEndpoint creates a connected UDP endpoint for the session request.
 func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	ep := newEndpoint(r.stack, r.route.NetProto, queue)
-	if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), []tcpip.NetworkProtocolNumber{r.route.NetProto}, ProtocolNumber, r.id, ep, ep.reusePort); err != nil {
+	if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), []tcpip.NetworkProtocolNumber{r.route.NetProto}, ProtocolNumber, r.id, ep, ep.reusePort, ep.bindToDevice); err != nil {
 		ep.Close()
 		return nil, err
 	}
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 2ec27be4d..5059ca22d 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -17,7 +17,6 @@ package udp_test
 import (
 	"bytes"
 	"fmt"
-	"math"
 	"math/rand"
 	"testing"
 	"time"
@@ -27,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
@@ -476,87 +476,59 @@ func newMinPayload(minSize int) []byte {
 	return b
 }
 
-func TestBindPortReuse(t *testing.T) {
-	c := newDualTestContext(t, defaultMTU)
-	defer c.cleanup()
-
-	c.createEndpoint(ipv6.ProtocolNumber)
-
-	var eps [5]tcpip.Endpoint
-	reusePortOpt := tcpip.ReusePortOption(1)
-
-	pollChannel := make(chan tcpip.Endpoint)
-	for i := 0; i < len(eps); i++ {
-		// Try to receive the data.
-		wq := waiter.Queue{}
-		we, ch := waiter.NewChannelEntry(nil)
-		wq.EventRegister(&we, waiter.EventIn)
-		defer wq.EventUnregister(&we)
-		defer close(ch)
-
-		var err *tcpip.Error
-		eps[i], err = c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq)
-		if err != nil {
-			c.t.Fatalf("NewEndpoint failed: %v", err)
-		}
-
-		go func(ep tcpip.Endpoint) {
-			for range ch {
-				pollChannel <- ep
-			}
-		}(eps[i])
+func TestBindToDeviceOption(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
 
-		defer eps[i].Close()
-		if err := eps[i].SetSockOpt(reusePortOpt); err != nil {
-			c.t.Fatalf("SetSockOpt failed failed: %v", err)
-		}
-		if err := eps[i].Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil {
-			t.Fatalf("ep.Bind(...) failed: %v", err)
-		}
+	ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %v", err)
 	}
+	defer ep.Close()
 
-	npackets := 100000
-	nports := 10000
-	ports := make(map[uint16]tcpip.Endpoint)
-	stats := make(map[tcpip.Endpoint]int)
-	for i := 0; i < npackets; i++ {
-		// Send a packet.
-		port := uint16(i % nports)
-		payload := newPayload()
-		c.injectV6Packet(payload, &header4Tuple{
-			srcAddr: tcpip.FullAddress{Addr: testV6Addr, Port: testPort + port},
-			dstAddr: tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort},
-		})
+	if err := s.CreateNamedNIC(321, "my_device", loopback.New()); err != nil {
+		t.Errorf("CreateNamedNIC failed: %v", err)
+	}
 
-		var addr tcpip.FullAddress
-		ep := <-pollChannel
-		_, _, err := ep.Read(&addr)
-		if err != nil {
-			c.t.Fatalf("Read failed: %v", err)
-		}
-		stats[ep]++
-		if i < nports {
-			ports[uint16(i)] = ep
-		} else {
-			// Check that all packets from one client are handled
-			// by the same socket.
-			if ports[port] != ep {
-				t.Fatalf("Port mismatch")
-			}
-		}
+	// Make an nameless NIC.
+	if err := s.CreateNIC(54321, loopback.New()); err != nil {
+		t.Errorf("CreateNIC failed: %v", err)
 	}
 
-	if len(stats) != len(eps) {
-		t.Fatalf("Only %d(expected %d) sockets received packets", len(stats), len(eps))
+	// strPtr is used instead of taking the address of string literals, which is
+	// a compiler error.
+	strPtr := func(s string) *string {
+		return &s
 	}
 
-	// Check that a packet distribution is fair between sockets.
-	for _, c := range stats {
-		n := float64(npackets) / float64(len(eps))
-		// The deviation is less than 10%.
-		if math.Abs(float64(c)-n) > n/10 {
-			t.Fatal(c, n)
-		}
+	testActions := []struct {
+		name                 string
+		setBindToDevice      *string
+		setBindToDeviceError *tcpip.Error
+		getBindToDevice      tcpip.BindToDeviceOption
+	}{
+		{"GetDefaultValue", nil, nil, ""},
+		{"BindToNonExistent", strPtr("non_existent_device"), tcpip.ErrUnknownDevice, ""},
+		{"BindToExistent", strPtr("my_device"), nil, "my_device"},
+		{"UnbindToDevice", strPtr(""), nil, ""},
+	}
+	for _, testAction := range testActions {
+		t.Run(testAction.name, func(t *testing.T) {
+			if testAction.setBindToDevice != nil {
+				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
+				if got, want := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; got != want {
+					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, got, want)
+				}
+			}
+			bindToDevice := tcpip.BindToDeviceOption("to be modified by GetSockOpt")
+			if ep.GetSockOpt(&bindToDevice) != nil {
+				t.Errorf("GetSockOpt got %v, want %v", ep.GetSockOpt(&bindToDevice), nil)
+			}
+			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
+				t.Errorf("bindToDevice got %q, want %q", got, want)
+			}
+		})
 	}
 }
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 28b23ce58..e645eebfa 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2463,6 +2463,63 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "socket_bind_to_device_test",
+    testonly = 1,
+    srcs = [
+        "socket_bind_to_device.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_bind_to_device_util",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "socket_bind_to_device_sequence_test",
+    testonly = 1,
+    srcs = [
+        "socket_bind_to_device_sequence.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_bind_to_device_util",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "socket_bind_to_device_distribution_test",
+    testonly = 1,
+    srcs = [
+        "socket_bind_to_device_distribution.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_bind_to_device_util",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "socket_ip_udp_loopback_non_blocking_test",
     testonly = 1,
@@ -2740,6 +2797,23 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "socket_bind_to_device_util",
+    testonly = 1,
+    srcs = [
+        "socket_bind_to_device_util.cc",
+    ],
+    hdrs = [
+        "socket_bind_to_device_util.h",
+    ],
+    deps = [
+        "//test/util:test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
+    alwayslink = 1,
+)
+
 cc_binary(
     name = "socket_stream_local_test",
     testonly = 1,
@@ -3253,6 +3327,7 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "//test/util:uid_util",
         "@com_google_absl//absl/flags:flag",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
diff --git a/test/syscalls/linux/socket_bind_to_device.cc b/test/syscalls/linux/socket_bind_to_device.cc
new file mode 100644
index 000000000..d20821cac
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device.cc
@@ -0,0 +1,314 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+
+// Test fixture for SO_BINDTODEVICE tests.
+class BindToDeviceTest : public ::testing::TestWithParam<SocketKind> {
+ protected:
+  void SetUp() override {
+    printf("Testing case: %s\n", GetParam().description.c_str());
+    ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)))
+        << "CAP_NET_RAW is required to use SO_BINDTODEVICE";
+
+    interface_name_ = "eth1";
+    auto interface_names = GetInterfaceNames();
+    if (interface_names.find(interface_name_) == interface_names.end()) {
+      // Need a tunnel.
+      tunnel_ = ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New());
+      interface_name_ = tunnel_->GetName();
+      ASSERT_FALSE(interface_name_.empty());
+    }
+    socket_ = ASSERT_NO_ERRNO_AND_VALUE(GetParam().Create());
+  }
+
+  string interface_name() const { return interface_name_; }
+
+  int socket_fd() const { return socket_->get(); }
+
+ private:
+  std::unique_ptr<Tunnel> tunnel_;
+  string interface_name_;
+  std::unique_ptr<FileDescriptor> socket_;
+};
+
+constexpr char kIllegalIfnameChar = '/';
+
+// Tests getsockopt of the default value.
+TEST_P(BindToDeviceTest, GetsockoptDefault) {
+  char name_buffer[IFNAMSIZ * 2];
+  char original_name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  // Read the default SO_BINDTODEVICE.
+  memset(original_name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  for (size_t i = 0; i <= sizeof(name_buffer); i++) {
+    memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+    name_buffer_size = i;
+    EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                           name_buffer, &name_buffer_size),
+                SyscallSucceedsWithValue(0));
+    EXPECT_EQ(name_buffer_size, 0);
+    EXPECT_EQ(memcmp(name_buffer, original_name_buffer, sizeof(name_buffer)),
+              0);
+  }
+}
+
+// Tests setsockopt of invalid device name.
+TEST_P(BindToDeviceTest, SetsockoptInvalidDeviceName) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  // Set an invalid device name.
+  memset(name_buffer, kIllegalIfnameChar, 5);
+  name_buffer_size = 5;
+  EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         name_buffer_size),
+              SyscallFailsWithErrno(ENODEV));
+}
+
+// Tests setsockopt of a buffer with a valid device name but not
+// null-terminated, with different sizes of buffer.
+TEST_P(BindToDeviceTest, SetsockoptValidDeviceNameWithoutNullTermination) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  strncpy(name_buffer, interface_name().c_str(), interface_name().size() + 1);
+  // Intentionally overwrite the null at the end.
+  memset(name_buffer + interface_name().size(), kIllegalIfnameChar,
+         sizeof(name_buffer) - interface_name().size());
+  for (size_t i = 1; i <= sizeof(name_buffer); i++) {
+    name_buffer_size = i;
+    SCOPED_TRACE(absl::StrCat("Buffer size: ", i));
+    // It should only work if the size provided is exactly right.
+    if (name_buffer_size == interface_name().size()) {
+      EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                             name_buffer, name_buffer_size),
+                  SyscallSucceeds());
+    } else {
+      EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                             name_buffer, name_buffer_size),
+                  SyscallFailsWithErrno(ENODEV));
+    }
+  }
+}
+
+// Tests setsockopt of a buffer with a valid device name and null-terminated,
+// with different sizes of buffer.
+TEST_P(BindToDeviceTest, SetsockoptValidDeviceNameWithNullTermination) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  strncpy(name_buffer, interface_name().c_str(), interface_name().size() + 1);
+  // Don't overwrite the null at the end.
+  memset(name_buffer + interface_name().size() + 1, kIllegalIfnameChar,
+         sizeof(name_buffer) - interface_name().size() - 1);
+  for (size_t i = 1; i <= sizeof(name_buffer); i++) {
+    name_buffer_size = i;
+    SCOPED_TRACE(absl::StrCat("Buffer size: ", i));
+    // It should only work if the size provided is at least the right size.
+    if (name_buffer_size >= interface_name().size()) {
+      EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                             name_buffer, name_buffer_size),
+                  SyscallSucceeds());
+    } else {
+      EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                             name_buffer, name_buffer_size),
+                  SyscallFailsWithErrno(ENODEV));
+    }
+  }
+}
+
+// Tests that setsockopt of an invalid device name doesn't unset the previous
+// valid setsockopt.
+TEST_P(BindToDeviceTest, SetsockoptValidThenInvalid) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  // Write successfully.
+  strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+  ASSERT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         sizeof(name_buffer)),
+              SyscallSucceeds());
+
+  // Read it back successfully.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         &name_buffer_size),
+              SyscallSucceeds());
+  EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+  EXPECT_STREQ(name_buffer, interface_name().c_str());
+
+  // Write unsuccessfully.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = 5;
+  EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         sizeof(name_buffer)),
+              SyscallFailsWithErrno(ENODEV));
+
+  // Read it back successfully, it's unchanged.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         &name_buffer_size),
+              SyscallSucceeds());
+  EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+  EXPECT_STREQ(name_buffer, interface_name().c_str());
+}
+
+// Tests that setsockopt of zero-length string correctly unsets the previous
+// value.
+TEST_P(BindToDeviceTest, SetsockoptValidThenClear) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  // Write successfully.
+  strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+  EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         sizeof(name_buffer)),
+              SyscallSucceeds());
+
+  // Read it back successfully.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         &name_buffer_size),
+              SyscallSucceeds());
+  EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+  EXPECT_STREQ(name_buffer, interface_name().c_str());
+
+  // Clear it successfully.
+  name_buffer_size = 0;
+  EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         name_buffer_size),
+              SyscallSucceeds());
+
+  // Read it back successfully, it's cleared.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         &name_buffer_size),
+              SyscallSucceeds());
+  EXPECT_EQ(name_buffer_size, 0);
+}
+
+// Tests that setsockopt of empty string correctly unsets the previous
+// value.
+TEST_P(BindToDeviceTest, SetsockoptValidThenClearWithNull) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  // Write successfully.
+  strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+  EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         sizeof(name_buffer)),
+              SyscallSucceeds());
+
+  // Read it back successfully.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         &name_buffer_size),
+              SyscallSucceeds());
+  EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+  EXPECT_STREQ(name_buffer, interface_name().c_str());
+
+  // Clear it successfully.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer[0] = 0;
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         name_buffer_size),
+              SyscallSucceeds());
+
+  // Read it back successfully, it's cleared.
+  memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+  name_buffer_size = sizeof(name_buffer);
+  EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         &name_buffer_size),
+              SyscallSucceeds());
+  EXPECT_EQ(name_buffer_size, 0);
+}
+
+// Tests getsockopt with different buffer sizes.
+TEST_P(BindToDeviceTest, GetsockoptDevice) {
+  char name_buffer[IFNAMSIZ * 2];
+  socklen_t name_buffer_size;
+
+  // Write successfully.
+  strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+  ASSERT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+                         sizeof(name_buffer)),
+              SyscallSucceeds());
+
+  // Read it back at various buffer sizes.
+  for (size_t i = 0; i <= sizeof(name_buffer); i++) {
+    memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+    name_buffer_size = i;
+    SCOPED_TRACE(absl::StrCat("Buffer size: ", i));
+    // Linux only allows a buffer at least IFNAMSIZ, even if less would suffice
+    // for this interface name.
+    if (name_buffer_size >= IFNAMSIZ) {
+      EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                             name_buffer, &name_buffer_size),
+                  SyscallSucceeds());
+      EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+      EXPECT_STREQ(name_buffer, interface_name().c_str());
+    } else {
+      EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+                             name_buffer, &name_buffer_size),
+                  SyscallFailsWithErrno(EINVAL));
+      EXPECT_EQ(name_buffer_size, i);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceTest,
+                         ::testing::Values(IPv4UDPUnboundSocket(0),
+                                           IPv4TCPUnboundSocket(0)));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
new file mode 100644
index 000000000..4d2400328
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -0,0 +1,381 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <atomic>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+using std::vector;
+
+struct EndpointConfig {
+  std::string bind_to_device;
+  double expected_ratio;
+};
+
+struct DistributionTestCase {
+  std::string name;
+  std::vector<EndpointConfig> endpoints;
+};
+
+struct ListenerConnector {
+  TestAddress listener;
+  TestAddress connector;
+};
+
+// Test fixture for SO_BINDTODEVICE tests the distribution of packets received
+// with varying SO_BINDTODEVICE settings.
+class BindToDeviceDistributionTest
+    : public ::testing::TestWithParam<
+          ::testing::tuple<ListenerConnector, DistributionTestCase>> {
+ protected:
+  void SetUp() override {
+    printf("Testing case: %s, listener=%s, connector=%s\n",
+           ::testing::get<1>(GetParam()).name.c_str(),
+           ::testing::get<0>(GetParam()).listener.description.c_str(),
+           ::testing::get<0>(GetParam()).connector.description.c_str());
+    ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)))
+        << "CAP_NET_RAW is required to use SO_BINDTODEVICE";
+  }
+};
+
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
+  switch (family) {
+    case AF_INET:
+      return static_cast<uint16_t>(
+          reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
+    case AF_INET6:
+      return static_cast<uint16_t>(
+          reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+}
+
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
+  switch (family) {
+    case AF_INET:
+      reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
+      return NoError();
+    case AF_INET6:
+      reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port;
+      return NoError();
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+}
+
+// Binds sockets to different devices and then creates many TCP connections.
+// Checks that the distribution of connections received on the sockets matches
+// the expectation.
+TEST_P(BindToDeviceDistributionTest, Tcp) {
+  auto const& [listener_connector, test] = GetParam();
+
+  TestAddress const& listener = listener_connector.listener;
+  TestAddress const& connector = listener_connector.connector;
+  sockaddr_storage listen_addr = listener.addr;
+  sockaddr_storage conn_addr = connector.addr;
+
+  auto interface_names = GetInterfaceNames();
+
+  // Create the listening sockets.
+  std::vector<FileDescriptor> listener_fds;
+  std::vector<std::unique_ptr<Tunnel>> all_tunnels;
+  for (auto const& endpoint : test.endpoints) {
+    if (!endpoint.bind_to_device.empty() &&
+        interface_names.find(endpoint.bind_to_device) ==
+            interface_names.end()) {
+      all_tunnels.push_back(
+          ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New(endpoint.bind_to_device)));
+      interface_names.insert(endpoint.bind_to_device);
+    }
+
+    listener_fds.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)));
+    int fd = listener_fds.back().get();
+
+    ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                           sizeof(kSockOptOn)),
+                SyscallSucceeds());
+    ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
+                           endpoint.bind_to_device.c_str(),
+                           endpoint.bind_to_device.size() + 1),
+                SyscallSucceeds());
+    ASSERT_THAT(
+        bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+        SyscallSucceeds());
+    ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
+
+    // On the first bind we need to determine which port was bound.
+    if (listener_fds.size() > 1) {
+      continue;
+    }
+
+    // Get the port bound by the listening socket.
+    socklen_t addrlen = listener.addr_len;
+    ASSERT_THAT(
+        getsockname(listener_fds[0].get(),
+                    reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+        SyscallSucceeds());
+    uint16_t const port =
+        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+    ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  }
+
+  constexpr int kConnectAttempts = 10000;
+  std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
+  std::vector<int> accept_counts(listener_fds.size(), 0);
+  std::vector<std::unique_ptr<ScopedThread>> listen_threads(
+      listener_fds.size());
+
+  for (int i = 0; i < listener_fds.size(); i++) {
+    listen_threads[i] = absl::make_unique<ScopedThread>(
+        [&listener_fds, &accept_counts, &connects_received, i,
+         kConnectAttempts]() {
+          do {
+            auto fd = Accept(listener_fds[i].get(), nullptr, nullptr);
+            if (!fd.ok()) {
+              // Another thread has shutdown our read side causing the accept to
+              // fail.
+              ASSERT_GE(connects_received, kConnectAttempts)
+                  << "errno = " << fd.error();
+              return;
+            }
+            // Receive some data from a socket to be sure that the connect()
+            // system call has been completed on another side.
+            int data;
+            EXPECT_THAT(
+                RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
+                SyscallSucceedsWithValue(sizeof(data)));
+            accept_counts[i]++;
+          } while (++connects_received < kConnectAttempts);
+
+          // Shutdown all sockets to wake up other threads.
+          for (auto const& listener_fd : listener_fds) {
+            shutdown(listener_fd.get(), SHUT_RDWR);
+          }
+        });
+  }
+
+  for (int i = 0; i < kConnectAttempts; i++) {
+    FileDescriptor const fd = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(
+        RetryEINTR(connect)(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                            connector.addr_len),
+        SyscallSucceeds());
+
+    EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+                SyscallSucceedsWithValue(sizeof(i)));
+  }
+
+  // Join threads to be sure that all connections have been counted.
+  for (auto const& listen_thread : listen_threads) {
+    listen_thread->Join();
+  }
+  // Check that connections are distributed correctly among listening sockets.
+  for (int i = 0; i < accept_counts.size(); i++) {
+    EXPECT_THAT(
+        accept_counts[i],
+        EquivalentWithin(static_cast<int>(kConnectAttempts *
+                                          test.endpoints[i].expected_ratio),
+                         0.10))
+        << "endpoint " << i << " got the wrong number of packets";
+  }
+}
+
+// Binds sockets to different devices and then sends many UDP packets.  Checks
+// that the distribution of packets received on the sockets matches the
+// expectation.
+TEST_P(BindToDeviceDistributionTest, Udp) {
+  auto const& [listener_connector, test] = GetParam();
+
+  TestAddress const& listener = listener_connector.listener;
+  TestAddress const& connector = listener_connector.connector;
+  sockaddr_storage listen_addr = listener.addr;
+  sockaddr_storage conn_addr = connector.addr;
+
+  auto interface_names = GetInterfaceNames();
+
+  // Create the listening socket.
+  std::vector<FileDescriptor> listener_fds;
+  std::vector<std::unique_ptr<Tunnel>> all_tunnels;
+  for (auto const& endpoint : test.endpoints) {
+    if (!endpoint.bind_to_device.empty() &&
+        interface_names.find(endpoint.bind_to_device) ==
+            interface_names.end()) {
+      all_tunnels.push_back(
+          ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New(endpoint.bind_to_device)));
+      interface_names.insert(endpoint.bind_to_device);
+    }
+
+    listener_fds.push_back(
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0)));
+    int fd = listener_fds.back().get();
+
+    ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                           sizeof(kSockOptOn)),
+                SyscallSucceeds());
+    ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
+                           endpoint.bind_to_device.c_str(),
+                           endpoint.bind_to_device.size() + 1),
+                SyscallSucceeds());
+    ASSERT_THAT(
+        bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+        SyscallSucceeds());
+
+    // On the first bind we need to determine which port was bound.
+    if (listener_fds.size() > 1) {
+      continue;
+    }
+
+    // Get the port bound by the listening socket.
+    socklen_t addrlen = listener.addr_len;
+    ASSERT_THAT(
+        getsockname(listener_fds[0].get(),
+                    reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+        SyscallSucceeds());
+    uint16_t const port =
+        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+    ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+    ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  }
+
+  constexpr int kConnectAttempts = 10000;
+  std::atomic<int> packets_received = ATOMIC_VAR_INIT(0);
+  std::vector<int> packets_per_socket(listener_fds.size(), 0);
+  std::vector<std::unique_ptr<ScopedThread>> receiver_threads(
+      listener_fds.size());
+
+  for (int i = 0; i < listener_fds.size(); i++) {
+    receiver_threads[i] = absl::make_unique<ScopedThread>(
+        [&listener_fds, &packets_per_socket, &packets_received, i]() {
+          do {
+            struct sockaddr_storage addr = {};
+            socklen_t addrlen = sizeof(addr);
+            int data;
+
+            auto ret = RetryEINTR(recvfrom)(
+                listener_fds[i].get(), &data, sizeof(data), 0,
+                reinterpret_cast<struct sockaddr*>(&addr), &addrlen);
+
+            if (packets_received < kConnectAttempts) {
+              ASSERT_THAT(ret, SyscallSucceedsWithValue(sizeof(data)));
+            }
+
+            if (ret != sizeof(data)) {
+              // Another thread may have shutdown our read side causing the
+              // recvfrom to fail.
+              break;
+            }
+
+            packets_received++;
+            packets_per_socket[i]++;
+
+            // A response is required to synchronize with the main thread,
+            // otherwise the main thread can send more than can fit into receive
+            // queues.
+            EXPECT_THAT(RetryEINTR(sendto)(
+                            listener_fds[i].get(), &data, sizeof(data), 0,
+                            reinterpret_cast<sockaddr*>(&addr), addrlen),
+                        SyscallSucceedsWithValue(sizeof(data)));
+          } while (packets_received < kConnectAttempts);
+
+          // Shutdown all sockets to wake up other threads.
+          for (auto const& listener_fd : listener_fds) {
+            shutdown(listener_fd.get(), SHUT_RDWR);
+          }
+        });
+  }
+
+  for (int i = 0; i < kConnectAttempts; i++) {
+    FileDescriptor const fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0));
+    EXPECT_THAT(RetryEINTR(sendto)(fd.get(), &i, sizeof(i), 0,
+                                   reinterpret_cast<sockaddr*>(&conn_addr),
+                                   connector.addr_len),
+                SyscallSucceedsWithValue(sizeof(i)));
+    int data;
+    EXPECT_THAT(RetryEINTR(recv)(fd.get(), &data, sizeof(data), 0),
+                SyscallSucceedsWithValue(sizeof(data)));
+  }
+
+  // Join threads to be sure that all connections have been counted.
+  for (auto const& receiver_thread : receiver_threads) {
+    receiver_thread->Join();
+  }
+  // Check that packets are distributed correctly among listening sockets.
+  for (int i = 0; i < packets_per_socket.size(); i++) {
+    EXPECT_THAT(
+        packets_per_socket[i],
+        EquivalentWithin(static_cast<int>(kConnectAttempts *
+                                          test.endpoints[i].expected_ratio),
+                         0.10))
+        << "endpoint " << i << " got the wrong number of packets";
+  }
+}
+
+std::vector<DistributionTestCase> GetDistributionTestCases() {
+  return std::vector<DistributionTestCase>{
+      {"Even distribution among sockets not bound to device",
+       {{"", 1. / 3}, {"", 1. / 3}, {"", 1. / 3}}},
+      {"Sockets bound to other interfaces get no packets",
+       {{"eth1", 0}, {"", 1. / 2}, {"", 1. / 2}}},
+      {"Bound has priority over unbound", {{"eth1", 0}, {"", 0}, {"lo", 1}}},
+      {"Even distribution among sockets bound to device",
+       {{"eth1", 0}, {"lo", 1. / 2}, {"lo", 1. / 2}}},
+  };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    BindToDeviceTest, BindToDeviceDistributionTest,
+    ::testing::Combine(::testing::Values(
+                           // Listeners bound to IPv4 addresses refuse
+                           // connections using IPv6 addresses.
+                           ListenerConnector{V4Any(), V4Loopback()},
+                           ListenerConnector{V4Loopback(), V4MappedLoopback()}),
+                       ::testing::ValuesIn(GetDistributionTestCases())));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
new file mode 100644
index 000000000..a7365d139
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -0,0 +1,316 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+using std::vector;
+
+// Test fixture for SO_BINDTODEVICE tests the results of sequences of socket
+// binding.
+class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
+ protected:
+  void SetUp() override {
+    printf("Testing case: %s\n", GetParam().description.c_str());
+    ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)))
+        << "CAP_NET_RAW is required to use SO_BINDTODEVICE";
+    socket_factory_ = GetParam();
+
+    interface_names_ = GetInterfaceNames();
+  }
+
+  PosixErrorOr<std::unique_ptr<FileDescriptor>> NewSocket() const {
+    return socket_factory_.Create();
+  }
+
+  // Gets a device by device_id.  If the device_id has been seen before, returns
+  // the previously returned device.  If not, finds or creates a new device.
+  // Returns an empty string on failure.
+  void GetDevice(int device_id, string *device_name) {
+    auto device = devices_.find(device_id);
+    if (device != devices_.end()) {
+      *device_name = device->second;
+      return;
+    }
+
+    // Need to pick a new device.  Try ethernet first.
+    *device_name = absl::StrCat("eth", next_unused_eth_);
+    if (interface_names_.find(*device_name) != interface_names_.end()) {
+      devices_[device_id] = *device_name;
+      next_unused_eth_++;
+      return;
+    }
+
+    // Need to make a new tunnel device.  gVisor tests should have enough
+    // ethernet devices to never reach here.
+    ASSERT_FALSE(IsRunningOnGvisor());
+    // Need a tunnel.
+    tunnels_.push_back(ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New()));
+    devices_[device_id] = tunnels_.back()->GetName();
+    *device_name = devices_[device_id];
+  }
+
+  // Release the socket
+  void ReleaseSocket(int socket_id) {
+    // Close the socket that was made in a previous action.  The socket_id
+    // indicates which socket to close based on index into the list of actions.
+    sockets_to_close_.erase(socket_id);
+  }
+
+  // Bind a socket with the reuse option and bind_to_device options.  Checks
+  // that all steps succeed and that the bind command's error matches want.
+  // Sets the socket_id to uniquely identify the socket bound if it is not
+  // nullptr.
+  void BindSocket(bool reuse, int device_id = 0, int want = 0,
+                  int *socket_id = nullptr) {
+    next_socket_id_++;
+    sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+    auto socket_fd = sockets_to_close_[next_socket_id_]->get();
+    if (socket_id != nullptr) {
+      *socket_id = next_socket_id_;
+    }
+
+    // If reuse is indicated, do that.
+    if (reuse) {
+      EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                             sizeof(kSockOptOn)),
+                  SyscallSucceedsWithValue(0));
+    }
+
+    // If the device is non-zero, bind to that device.
+    if (device_id != 0) {
+      string device_name;
+      ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+      EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+                             device_name.c_str(), device_name.size() + 1),
+                  SyscallSucceedsWithValue(0));
+      char get_device[100];
+      socklen_t get_device_size = 100;
+      EXPECT_THAT(getsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, get_device,
+                             &get_device_size),
+                  SyscallSucceedsWithValue(0));
+    }
+
+    struct sockaddr_in addr = {};
+    addr.sin_family = AF_INET;
+    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+    addr.sin_port = port_;
+    if (want == 0) {
+      ASSERT_THAT(
+          bind(socket_fd, reinterpret_cast<const struct sockaddr *>(&addr),
+               sizeof(addr)),
+          SyscallSucceeds());
+    } else {
+      ASSERT_THAT(
+          bind(socket_fd, reinterpret_cast<const struct sockaddr *>(&addr),
+               sizeof(addr)),
+          SyscallFailsWithErrno(want));
+    }
+
+    if (port_ == 0) {
+      // We don't yet know what port we'll be using so we need to fetch it and
+      // remember it for future commands.
+      socklen_t addr_size = sizeof(addr);
+      ASSERT_THAT(
+          getsockname(socket_fd, reinterpret_cast<struct sockaddr *>(&addr),
+                      &addr_size),
+          SyscallSucceeds());
+      port_ = addr.sin_port;
+    }
+  }
+
+ private:
+  SocketKind socket_factory_;
+  // devices maps from the device id in the test case to the name of the device.
+  std::unordered_map<int, string> devices_;
+  // These are the tunnels that were created for the test and will be destroyed
+  // by the destructor.
+  vector<std::unique_ptr<Tunnel>> tunnels_;
+  // A list of all interface names before the test started.
+  std::unordered_set<string> interface_names_;
+  // The next ethernet device to use when requested a device.
+  int next_unused_eth_ = 1;
+  // The port for all tests.  Originally 0 (any) and later set to the port that
+  // all further commands will use.
+  in_port_t port_ = 0;
+  // sockets_to_close_ is a map from action index to the socket that was
+  // created.
+  std::unordered_map<int,
+                     std::unique_ptr<gvisor::testing::FileDescriptor>>
+      sockets_to_close_;
+  int next_socket_id_ = 0;
+};
+
+TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 3));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 3, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindToDevice) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 1));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 2));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ false));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithDevice) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuse) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 999, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 999, 0));
+}
+
+TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindAndRelease) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  int to_release;
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 0, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 345, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
+  // Release the bind to device 0 and try again.
+  ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 345));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+}
+
+INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
+                         ::testing::Values(IPv4UDPUnboundSocket(0),
+                                           IPv4TCPUnboundSocket(0)));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_util.cc b/test/syscalls/linux/socket_bind_to_device_util.cc
new file mode 100644
index 000000000..f4ee775bd
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_util.cc
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+
+PosixErrorOr<std::unique_ptr<Tunnel>> Tunnel::New(string tunnel_name) {
+  int fd;
+  RETURN_ERROR_IF_SYSCALL_FAIL(fd = open("/dev/net/tun", O_RDWR));
+
+  // Using `new` to access a non-public constructor.
+  auto new_tunnel = absl::WrapUnique(new Tunnel(fd));
+
+  ifreq ifr = {};
+  ifr.ifr_flags = IFF_TUN;
+  strncpy(ifr.ifr_name, tunnel_name.c_str(), sizeof(ifr.ifr_name));
+
+  RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(fd, TUNSETIFF, &ifr));
+  new_tunnel->name_ = ifr.ifr_name;
+  return new_tunnel;
+}
+
+std::unordered_set<string> GetInterfaceNames() {
+  struct if_nameindex* interfaces = if_nameindex();
+  std::unordered_set<string> names;
+  if (interfaces == nullptr) {
+    return names;
+  }
+  for (auto interface = interfaces;
+       interface->if_index != 0 || interface->if_name != nullptr; interface++) {
+    names.insert(interface->if_name);
+  }
+  if_freenameindex(interfaces);
+  return names;
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_util.h b/test/syscalls/linux/socket_bind_to_device_util.h
new file mode 100644
index 000000000..f941ccc86
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_util.h
@@ -0,0 +1,67 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_SOCKET_BIND_TO_DEVICE_UTILS_H_
+#define GVISOR_TEST_SYSCALLS_SOCKET_BIND_TO_DEVICE_UTILS_H_
+
+#include <arpa/inet.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+class Tunnel {
+ public:
+  static PosixErrorOr<std::unique_ptr<Tunnel>> New(
+      std::string tunnel_name = "");
+  const std::string& GetName() const { return name_; }
+
+  ~Tunnel() {
+    if (fd_ != -1) {
+      close(fd_);
+    }
+  }
+
+ private:
+  Tunnel(int fd) : fd_(fd) {}
+  int fd_ = -1;
+  std::string name_;
+};
+
+std::unordered_set<std::string> GetInterfaceNames();
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_SOCKET_BIND_TO_DEVICE_UTILS_H_
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index d48453a93..6218fbce1 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -25,6 +25,7 @@
 #include "test/util/posix_error.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
+#include "test/util/uid_util.h"
 
 ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
 ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
@@ -68,30 +69,6 @@ TEST(UidGidTest, Getgroups) {
   // here; see the setgroups test below.
 }
 
-// If the caller's real/effective/saved user/group IDs are all 0, IsRoot returns
-// true. Otherwise IsRoot logs an explanatory message and returns false.
-PosixErrorOr<bool> IsRoot() {
-  uid_t ruid, euid, suid;
-  int rc = getresuid(&ruid, &euid, &suid);
-  MaybeSave();
-  if (rc < 0) {
-    return PosixError(errno, "getresuid");
-  }
-  if (ruid != 0 || euid != 0 || suid != 0) {
-    return false;
-  }
-  gid_t rgid, egid, sgid;
-  rc = getresgid(&rgid, &egid, &sgid);
-  MaybeSave();
-  if (rc < 0) {
-    return PosixError(errno, "getresgid");
-  }
-  if (rgid != 0 || egid != 0 || sgid != 0) {
-    return false;
-  }
-  return true;
-}
-
 // Checks that the calling process' real/effective/saved user IDs are
 // ruid/euid/suid respectively.
 PosixError CheckUIDs(uid_t ruid, uid_t euid, uid_t suid) {
diff --git a/test/util/BUILD b/test/util/BUILD
index 25ed9c944..5d2a9cc2c 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -324,3 +324,14 @@ cc_library(
         ":test_util",
     ],
 )
+
+cc_library(
+    name = "uid_util",
+    testonly = 1,
+    srcs = ["uid_util.cc"],
+    hdrs = ["uid_util.h"],
+    deps = [
+        ":posix_error",
+        ":save_util",
+    ],
+)
diff --git a/test/util/uid_util.cc b/test/util/uid_util.cc
new file mode 100644
index 000000000..b131b4b99
--- /dev/null
+++ b/test/util/uid_util.cc
@@ -0,0 +1,44 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+
+namespace gvisor {
+namespace testing {
+
+PosixErrorOr<bool> IsRoot() {
+  uid_t ruid, euid, suid;
+  int rc = getresuid(&ruid, &euid, &suid);
+  MaybeSave();
+  if (rc < 0) {
+    return PosixError(errno, "getresuid");
+  }
+  if (ruid != 0 || euid != 0 || suid != 0) {
+    return false;
+  }
+  gid_t rgid, egid, sgid;
+  rc = getresgid(&rgid, &egid, &sgid);
+  MaybeSave();
+  if (rc < 0) {
+    return PosixError(errno, "getresgid");
+  }
+  if (rgid != 0 || egid != 0 || sgid != 0) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/uid_util.h b/test/util/uid_util.h
new file mode 100644
index 000000000..2cd387fb0
--- /dev/null
+++ b/test/util/uid_util.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_UID_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_UID_UTIL_H_
+
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+// Returns true if the caller's real/effective/saved user/group IDs are all 0.
+PosixErrorOr<bool> IsRoot();
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_UID_UTIL_H_
-- 
cgit v1.2.3


From 981fc188f0f0250ad59e39d566a56c71430b3287 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 30 Sep 2019 10:02:14 -0700
Subject: Only copy out remaining time on nanosleep success

It looks like the old code attempted to do this, but didn't realize that err !=
nil even in the happy case.

PiperOrigin-RevId: 272005887
---
 pkg/sentry/syscalls/linux/sys_time.go  | 39 +++++++--------
 test/syscalls/linux/clock_nanosleep.cc | 86 ++++++++++++++++++++++------------
 2 files changed, 73 insertions(+), 52 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index 4b3f043a2..b887fa9d7 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -15,6 +15,7 @@
 package linux
 
 import (
+	"fmt"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -228,41 +229,35 @@ func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem use
 
 	timer.Destroy()
 
-	var remaining time.Duration
-	// Did we just block for the entire duration?
-	if err == syserror.ETIMEDOUT {
-		remaining = 0
-	} else {
-		remaining = dur - after.Sub(start)
+	switch err {
+	case syserror.ETIMEDOUT:
+		// Slept for entire timeout.
+		return nil
+	case syserror.ErrInterrupted:
+		// Interrupted.
+		remaining := dur - after.Sub(start)
 		if remaining < 0 {
 			remaining = time.Duration(0)
 		}
-	}
 
-	// Copy out remaining time.
-	if err != nil && rem != usermem.Addr(0) {
-		timeleft := linux.NsecToTimespec(remaining.Nanoseconds())
-		if err := copyTimespecOut(t, rem, &timeleft); err != nil {
-			return err
+		// Copy out remaining time.
+		if rem != 0 {
+			timeleft := linux.NsecToTimespec(remaining.Nanoseconds())
+			if err := copyTimespecOut(t, rem, &timeleft); err != nil {
+				return err
+			}
 		}
-	}
-
-	// Did we just block for the entire duration?
-	if err == syserror.ETIMEDOUT {
-		return nil
-	}
 
-	// If interrupted, arrange for a restart with the remaining duration.
-	if err == syserror.ErrInterrupted {
+		// Arrange for a restart with the remaining duration.
 		t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{
 			c:        c,
 			duration: remaining,
 			rem:      rem,
 		})
 		return kernel.ERESTART_RESTARTBLOCK
+	default:
+		panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err))
 	}
-
-	return err
 }
 
 // Nanosleep implements linux syscall Nanosleep(2).
diff --git a/test/syscalls/linux/clock_nanosleep.cc b/test/syscalls/linux/clock_nanosleep.cc
index 52a69d230..b55cddc52 100644
--- a/test/syscalls/linux/clock_nanosleep.cc
+++ b/test/syscalls/linux/clock_nanosleep.cc
@@ -43,7 +43,7 @@ int sys_clock_nanosleep(clockid_t clkid, int flags,
 
 PosixErrorOr<absl::Time> GetTime(clockid_t clk) {
   struct timespec ts = {};
-  int rc = clock_gettime(clk, &ts);
+  const int rc = clock_gettime(clk, &ts);
   MaybeSave();
   if (rc < 0) {
     return PosixError(errno, "clock_gettime");
@@ -67,31 +67,32 @@ TEST_P(WallClockNanosleepTest, InvalidValues) {
 }
 
 TEST_P(WallClockNanosleepTest, SleepOneSecond) {
-  absl::Duration const duration = absl::Seconds(1);
-  struct timespec dur = absl::ToTimespec(duration);
+  constexpr absl::Duration kSleepDuration = absl::Seconds(1);
+  struct timespec duration = absl::ToTimespec(kSleepDuration);
 
-  absl::Time const before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
-  EXPECT_THAT(RetryEINTR(sys_clock_nanosleep)(GetParam(), 0, &dur, &dur),
-              SyscallSucceeds());
-  absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+  const absl::Time before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+  EXPECT_THAT(
+      RetryEINTR(sys_clock_nanosleep)(GetParam(), 0, &duration, &duration),
+      SyscallSucceeds());
+  const absl::Time after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
 
-  EXPECT_GE(after - before, duration);
+  EXPECT_GE(after - before, kSleepDuration);
 }
 
 TEST_P(WallClockNanosleepTest, InterruptedNanosleep) {
-  absl::Duration const duration = absl::Seconds(60);
-  struct timespec dur = absl::ToTimespec(duration);
+  constexpr absl::Duration kSleepDuration = absl::Seconds(60);
+  struct timespec duration = absl::ToTimespec(kSleepDuration);
 
   // Install no-op signal handler for SIGALRM.
   struct sigaction sa = {};
   sigfillset(&sa.sa_mask);
   sa.sa_handler = +[](int signo) {};
-  auto const cleanup_sa =
+  const auto cleanup_sa =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
 
   // Measure time since setting the alarm, since the alarm will interrupt the
   // sleep and hence determine how long we sleep.
-  absl::Time const before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+  const absl::Time before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
 
   // Set an alarm to go off while sleeping.
   struct itimerval timer = {};
@@ -99,26 +100,51 @@ TEST_P(WallClockNanosleepTest, InterruptedNanosleep) {
   timer.it_value.tv_usec = 0;
   timer.it_interval.tv_sec = 1;
   timer.it_interval.tv_usec = 0;
-  auto const cleanup =
+  const auto cleanup =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, timer));
 
-  EXPECT_THAT(sys_clock_nanosleep(GetParam(), 0, &dur, &dur),
+  EXPECT_THAT(sys_clock_nanosleep(GetParam(), 0, &duration, &duration),
               SyscallFailsWithErrno(EINTR));
-  absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+  const absl::Time after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
 
-  absl::Duration const remaining = absl::DurationFromTimespec(dur);
-  EXPECT_GE(after - before + remaining, duration);
+  // Remaining time updated.
+  const absl::Duration remaining = absl::DurationFromTimespec(duration);
+  EXPECT_GE(after - before + remaining, kSleepDuration);
+}
+
+// Remaining time is *not* updated if nanosleep completes uninterrupted.
+TEST_P(WallClockNanosleepTest, UninterruptedNanosleep) {
+  constexpr absl::Duration kSleepDuration = absl::Milliseconds(10);
+  const struct timespec duration = absl::ToTimespec(kSleepDuration);
+
+  while (true) {
+    constexpr int kRemainingMagic = 42;
+    struct timespec remaining;
+    remaining.tv_sec = kRemainingMagic;
+    remaining.tv_nsec = kRemainingMagic;
+
+    int ret = sys_clock_nanosleep(GetParam(), 0, &duration, &remaining);
+    if (ret == EINTR) {
+      // Retry from beginning. We want a single uninterrupted call.
+      continue;
+    }
+
+    EXPECT_THAT(ret, SyscallSucceeds());
+    EXPECT_EQ(remaining.tv_sec, kRemainingMagic);
+    EXPECT_EQ(remaining.tv_nsec, kRemainingMagic);
+    break;
+  }
 }
 
 TEST_P(WallClockNanosleepTest, SleepUntil) {
-  absl::Time const now = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
-  absl::Time const until = now + absl::Seconds(2);
-  struct timespec ts = absl::ToTimespec(until);
+  const absl::Time now = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+  const absl::Time until = now + absl::Seconds(2);
+  const struct timespec ts = absl::ToTimespec(until);
 
   EXPECT_THAT(
       RetryEINTR(sys_clock_nanosleep)(GetParam(), TIMER_ABSTIME, &ts, nullptr),
       SyscallSucceeds());
-  absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+  const absl::Time after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
 
   EXPECT_GE(after, until);
 }
@@ -127,8 +153,8 @@ INSTANTIATE_TEST_SUITE_P(Sleepers, WallClockNanosleepTest,
                          ::testing::Values(CLOCK_REALTIME, CLOCK_MONOTONIC));
 
 TEST(ClockNanosleepProcessTest, SleepFiveSeconds) {
-  absl::Duration const kDuration = absl::Seconds(5);
-  struct timespec dur = absl::ToTimespec(kDuration);
+  const absl::Duration kSleepDuration = absl::Seconds(5);
+  struct timespec duration = absl::ToTimespec(kSleepDuration);
 
   // Ensure that CLOCK_PROCESS_CPUTIME_ID advances.
   std::atomic<bool> done(false);
@@ -136,16 +162,16 @@ TEST(ClockNanosleepProcessTest, SleepFiveSeconds) {
     while (!done.load()) {
     }
   });
-  auto const cleanup_done = Cleanup([&] { done.store(true); });
+  const auto cleanup_done = Cleanup([&] { done.store(true); });
 
-  absl::Time const before =
+  const absl::Time before =
       ASSERT_NO_ERRNO_AND_VALUE(GetTime(CLOCK_PROCESS_CPUTIME_ID));
-  EXPECT_THAT(
-      RetryEINTR(sys_clock_nanosleep)(CLOCK_PROCESS_CPUTIME_ID, 0, &dur, &dur),
-      SyscallSucceeds());
-  absl::Time const after =
+  EXPECT_THAT(RetryEINTR(sys_clock_nanosleep)(CLOCK_PROCESS_CPUTIME_ID, 0,
+                                              &duration, &duration),
+              SyscallSucceeds());
+  const absl::Time after =
       ASSERT_NO_ERRNO_AND_VALUE(GetTime(CLOCK_PROCESS_CPUTIME_ID));
-  EXPECT_GE(after - before, kDuration);
+  EXPECT_GE(after - before, kSleepDuration);
 }
 }  // namespace
 
-- 
cgit v1.2.3


From 3ad17ff5977bc639418f5409396fac8b3ceb370b Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 30 Sep 2019 13:06:27 -0700
Subject: Force timestamps to update when set via
 InodeOperations.SetTimestamps.

The gofer's CachingInodeOperations implementation contains an optimization for
the common open-read-close pattern when we have a host FD.  In this case, the
host kernel will update the timestamp for us to a reasonably close time, so we
don't need an extra RPC to the gofer.

However, when the app explicitly sets the timestamps (via futimes or similar)
then we actually DO need to update the timestamps, because the host kernel
won't do it for us.

To fix this, a new boolean `forceSetTimestamps` was added to
CachineInodeOperations.SetMaskedAttributes. It is only set by
gofer.InodeOperations.SetTimestamps.

PiperOrigin-RevId: 272048146
---
 pkg/sentry/fs/fsutil/host_mappable.go     |  2 +-
 pkg/sentry/fs/fsutil/inode_cached.go      | 22 ++++++++++++++--------
 pkg/sentry/fs/fsutil/inode_cached_test.go |  4 ++--
 pkg/sentry/fs/gofer/inode.go              | 15 +++++++++++----
 pkg/sentry/fs/host/inode.go               |  4 ++--
 5 files changed, 30 insertions(+), 17 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index d2495cb83..693625ddc 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -144,7 +144,7 @@ func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error {
 
 	mask := fs.AttrMask{Size: true}
 	attr := fs.UnstableAttr{Size: newSize}
-	if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr); err != nil {
+	if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr, false); err != nil {
 		return err
 	}
 
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index d404a79d4..dd80757dc 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -140,12 +140,16 @@ type CachedFileObject interface {
 	// WriteFromBlocksAt may return a partial write without an error.
 	WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)
 
-	// SetMaskedAttributes sets the attributes in attr that are true in mask
-	// on the backing file.
+	// SetMaskedAttributes sets the attributes in attr that are true in
+	// mask on the backing file. If the mask contains only ATime or MTime
+	// and the CachedFileObject has an FD to the file, then this operation
+	// is a noop unless forceSetTimestamps is true. This avoids an extra
+	// RPC to the gofer in the open-read/write-close case, when the
+	// timestamps on the file will be updated by the host kernel for us.
 	//
 	// SetMaskedAttributes may be called at any point, regardless of whether
 	// the file was opened.
-	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error
+	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error
 
 	// Allocate allows the caller to reserve disk space for the inode.
 	// It's equivalent to fallocate(2) with 'mode=0'.
@@ -224,7 +228,7 @@ func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.I
 
 	now := ktime.NowFromContext(ctx)
 	masked := fs.AttrMask{Perms: true}
-	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil {
+	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}, false); err != nil {
 		return false
 	}
 	c.attr.Perms = perms
@@ -246,7 +250,7 @@ func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode,
 		UID: owner.UID.Ok(),
 		GID: owner.GID.Ok(),
 	}
-	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil {
+	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}, false); err != nil {
 		return err
 	}
 	if owner.UID.Ok() {
@@ -282,7 +286,9 @@ func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.In
 		AccessTime:       !ts.ATimeOmit,
 		ModificationTime: !ts.MTimeOmit,
 	}
-	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil {
+	// Call SetMaskedAttributes with forceSetTimestamps = true to make sure
+	// the timestamp is updated.
+	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}, true); err != nil {
 		return err
 	}
 	if !ts.ATimeOmit {
@@ -305,7 +311,7 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode,
 	now := ktime.NowFromContext(ctx)
 	masked := fs.AttrMask{Size: true}
 	attr := fs.UnstableAttr{Size: size}
-	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr); err != nil {
+	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr, false); err != nil {
 		c.dataMu.Unlock()
 		return err
 	}
@@ -394,7 +400,7 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode)
 	c.dirtyAttr.Size = false
 
 	// Write out cached attributes.
-	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil {
+	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr, false); err != nil {
 		c.attrMu.Unlock()
 		return err
 	}
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
index eb5730c35..129f314c8 100644
--- a/pkg/sentry/fs/fsutil/inode_cached_test.go
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -39,7 +39,7 @@ func (noopBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.Block
 	return srcs.NumBytes(), nil
 }
 
-func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error {
+func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr, bool) error {
 	return nil
 }
 
@@ -230,7 +230,7 @@ func (f *sliceBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.B
 	return w.WriteFromBlocks(srcs)
 }
 
-func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error {
+func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr, bool) error {
 	return nil
 }
 
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 95b064aea..d918d6620 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -215,8 +215,8 @@ func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.Blo
 }
 
 // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
-func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
-	if i.skipSetAttr(mask) {
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error {
+	if i.skipSetAttr(mask, forceSetTimestamps) {
 		return nil
 	}
 	as, ans := attr.AccessTime.Unix()
@@ -251,13 +251,14 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa
 // when:
 //   - Mask is empty
 //   - Mask contains only attributes that cannot be set in the gofer
-//   - Mask contains only atime and/or mtime, and host FD exists
+//   - forceSetTimestamps is false and mask contains only atime and/or mtime
+//     and host FD exists
 //
 // Updates to atime and mtime can be skipped because cached value will be
 // "close enough" to host value, given that operation went directly to host FD.
 // Skipping atime updates is particularly important to reduce the number of
 // operations sent to the Gofer for readonly files.
-func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool {
+func (i *inodeFileState) skipSetAttr(mask fs.AttrMask, forceSetTimestamps bool) bool {
 	// First remove attributes that cannot be updated.
 	cpy := mask
 	cpy.Type = false
@@ -277,6 +278,12 @@ func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool {
 		return false
 	}
 
+	// If forceSetTimestamps was passed, then we cannot skip.
+	if forceSetTimestamps {
+		return false
+	}
+
+	// Skip if we have a host FD.
 	i.handlesMu.RLock()
 	defer i.handlesMu.RUnlock()
 	return (i.readHandles != nil && i.readHandles.Host != nil) ||
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 894ab01f0..a6e4a09e3 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -114,7 +114,7 @@ func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.Blo
 }
 
 // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
-func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, _ bool) error {
 	if mask.Empty() {
 		return nil
 	}
@@ -163,7 +163,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err
 	return unstableAttr(i.mops, &s), nil
 }
 
-// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
+// Allocate implements fsutil.CachedFileObject.Allocate.
 func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error {
 	return syscall.Fallocate(i.FD(), 0, offset, length)
 }
-- 
cgit v1.2.3


From 20841b98e14dd37aa40886668e337551b18f0fd3 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 30 Sep 2019 17:23:03 -0700
Subject: Update FIXME bug with GitHub issue.

PiperOrigin-RevId: 272101930
---
 pkg/sentry/sighandling/sighandling_unsafe.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go
index eace3766d..c303435d5 100644
--- a/pkg/sentry/sighandling/sighandling_unsafe.go
+++ b/pkg/sentry/sighandling/sighandling_unsafe.go
@@ -23,7 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
-// TODO(b/34161764): Move to pkg/abi/linux along with definitions in
+// FIXME(gvisor.dev/issue/214): Move to pkg/abi/linux along with definitions in
 // pkg/sentry/arch.
 type sigaction struct {
 	handler  uintptr
-- 
cgit v1.2.3


From 29a1ba54ea427d4fdd357453d74c93d16f5eca9b Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 30 Sep 2019 17:55:55 -0700
Subject: splice: compare inode numbers only if both ends are pipes

It isn't allowed to splice data from and into the same pipe.

But right now this check is broken, because we don't check that both ends are
pipes.

PiperOrigin-RevId: 272107022
---
 pkg/sentry/syscalls/linux/sys_splice.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index f0a292f2f..9f705ebca 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -245,12 +245,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		if inOffset != 0 || outOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
-	default:
-		return 0, nil, syserror.EINVAL
-	}
 
-	// We may not refer to the same pipe; otherwise it's a continuous loop.
-	if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+		// We may not refer to the same pipe; otherwise it's a continuous loop.
+		if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+			return 0, nil, syserror.EINVAL
+		}
+	default:
 		return 0, nil, syserror.EINVAL
 	}
 
-- 
cgit v1.2.3


From 7a234f736fe0e91824b50631e408bd07b2c0ed31 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 30 Sep 2019 18:22:25 -0700
Subject: splice: try another fallback option only if the previous one isn't
 supported

Reported-by: syzbot+bb5ed342be51d39b0cbb@syzkaller.appspotmail.com
PiperOrigin-RevId: 272110815
---
 pkg/sentry/fs/splice.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index b03b7f836..311798811 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -139,7 +139,7 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 
 	// Attempt to do a WriteTo; this is likely the most efficient.
 	n, err := src.FileOperations.WriteTo(ctx, src, w, opts.Length, opts.Dup)
-	if n == 0 && err != nil && err != syserror.ErrWouldBlock && !opts.Dup {
+	if n == 0 && err == syserror.ENOSYS && !opts.Dup {
 		// Attempt as a ReadFrom. If a WriteTo, a ReadFrom may also be
 		// more efficient than a copy if buffers are cached or readily
 		// available. (It's unlikely that they can actually be donated).
@@ -151,7 +151,7 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 	// if we block at some point, we could lose data. If the source is
 	// not a pipe then reading is not destructive; if the destination
 	// is a regular file, then it is guaranteed not to block writing.
-	if n == 0 && err != nil && err != syserror.ErrWouldBlock && !opts.Dup && (!dstPipe || !srcPipe) {
+	if n == 0 && err == syserror.ENOSYS && !opts.Dup && (!dstPipe || !srcPipe) {
 		// Fallback to an in-kernel copy.
 		n, err = io.Copy(w, &io.LimitedReader{
 			R: r,
-- 
cgit v1.2.3


From 53cc72da90f5b5a76b024b47fe4e38a81b495eb4 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 1 Oct 2019 11:29:35 -0700
Subject: Honor X bit on extra anon pages in PT_LOAD segments

Linux changed this behavior in 16e72e9b30986ee15f17fbb68189ca842c32af58
(v4.11). Previously, extra pages were always mapped RW. Now, those pages will
be executable if the segment specified PF_X. They still must be writeable.

PiperOrigin-RevId: 272256280
---
 pkg/sentry/loader/elf.go           | 18 +++++++++++-------
 test/syscalls/linux/exec_binary.cc | 19 ++++++++++++-------
 test/util/proc_util.cc             |  2 +-
 3 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index ba9c9ce12..2d9251e92 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -323,18 +323,22 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.
 			return syserror.ENOEXEC
 		}
 
+		// N.B. Linux uses vm_brk_flags to map these pages, which only
+		// honors the X bit, always mapping at least RW. ignoring These
+		// pages are not included in the final brk region.
+		prot := usermem.ReadWrite
+		if phdr.Flags&elf.PF_X == elf.PF_X {
+			prot.Execute = true
+		}
+
 		if _, err := m.MMap(ctx, memmap.MMapOpts{
 			Length: uint64(anonSize),
 			Addr:   anonAddr,
 			// Fixed without Unmap will fail the mmap if something is
 			// already at addr.
-			Fixed:   true,
-			Private: true,
-			// N.B. Linux uses vm_brk to map these pages, ignoring
-			// the segment protections, instead always mapping RW.
-			// These pages are not included in the final brk
-			// region.
-			Perms:    usermem.ReadWrite,
+			Fixed:    true,
+			Private:  true,
+			Perms:    prot,
 			MaxPerms: usermem.AnyAccess,
 		}); err != nil {
 			ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err)
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 91b55015c..68af882bb 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -401,12 +401,17 @@ TEST(ElfTest, DataSegment) {
              })));
 }
 
-// Additonal pages beyond filesz are always RW.
+// Additonal pages beyond filesz honor (only) execute protections.
 //
-// N.B. Linux uses set_brk -> vm_brk to additional pages beyond filesz (even
-// though start_brk itself will always be beyond memsz). As a result, the
-// segment permissions don't apply; the mapping is always RW.
+// N.B. Linux changed this in 4.11 (16e72e9b30986 "powerpc: do not make the
+// entire heap executable"). Previously, extra pages were always RW.
 TEST(ElfTest, ExtraMemPages) {
+  // gVisor has the newer behavior.
+  if (!IsRunningOnGvisor()) {
+    auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
+    SKIP_IF(version.major < 4 || (version.major == 4 && version.minor < 11));
+  }
+
   ElfBinary<64> elf = StandardElf();
 
   // Create a standard ELF, but extend to 1.5 pages. The second page will be the
@@ -415,7 +420,7 @@ TEST(ElfTest, ExtraMemPages) {
 
   decltype(elf)::ElfPhdr phdr = {};
   phdr.p_type = PT_LOAD;
-  // RWX segment. The extra anon page will be RW anyways.
+  // RWX segment. The extra anon page will also be RWX.
   //
   // N.B. Linux uses clear_user to clear the end of the file-mapped page, which
   // respects the mapping protections. Thus if we map this RO with memsz >
@@ -454,7 +459,7 @@ TEST(ElfTest, ExtraMemPages) {
                   {0x41000, 0x42000, true, true, true, true, kPageSize, 0, 0, 0,
                    file.path().c_str()},
                   // extra page from anon.
-                  {0x42000, 0x43000, true, true, false, true, 0, 0, 0, 0, ""},
+                  {0x42000, 0x43000, true, true, true, true, 0, 0, 0, 0, ""},
               })));
 }
 
@@ -469,7 +474,7 @@ TEST(ElfTest, AnonOnlySegment) {
   phdr.p_offset = 0;
   phdr.p_vaddr = 0x41000;
   phdr.p_filesz = 0;
-  phdr.p_memsz = kPageSize - 0xe8;
+  phdr.p_memsz = kPageSize;
   elf.phdrs.push_back(phdr);
 
   elf.UpdateOffsets();
diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc
index 75b24da37..34d636ba9 100644
--- a/test/util/proc_util.cc
+++ b/test/util/proc_util.cc
@@ -88,7 +88,7 @@ PosixErrorOr<std::vector<ProcMapsEntry>> ParseProcMaps(
   std::vector<ProcMapsEntry> entries;
   auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty());
   for (const auto& l : lines) {
-    std::cout << "line: " << l;
+    std::cout << "line: " << l << std::endl;
     ASSIGN_OR_RETURN_ERRNO(auto entry, ParseProcMapsLine(l));
     entries.push_back(entry);
   }
-- 
cgit v1.2.3


From dd69b49ed1103bab82a6b2ac95221b89b46f3376 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 1 Oct 2019 12:13:09 -0700
Subject: Disable cpuClockTicker when app is idle

Kernel.cpuClockTicker increments kernel.cpuClock, which tasks use as a clock to
track their CPU usage. This improves latency in the syscall path by avoid
expensive monotonic clock calls on every syscall entry/exit.

However, this timer fires every 10ms. Thus, when all tasks are idle (i.e.,
blocked or stopped), this forces a sentry wakeup every 10ms, when we may
otherwise be able to sleep until the next app-relevant event. These wakeups
cause the sentry to utilize approximately 2% CPU when the application is
otherwise idle.

Updates to clock are not strictly necessary when the app is idle, as there are
no readers of cpuClock. This commit reduces idle CPU by disabling the timer
when tasks are completely idle, and computing its effects at the next wakeup.

Rather than disabling the timer as soon as the app goes idle, we wait until the
next tick, which provides a window for short sleeps to sleep and wakeup without
doing the (relatively) expensive work of disabling and enabling the timer.

PiperOrigin-RevId: 272265822
---
 pkg/sentry/fs/timerfd/timerfd.go  |   3 +-
 pkg/sentry/kernel/kernel.go       | 129 ++++++++++++++++++++++++++++++++++++++
 pkg/sentry/kernel/posixtimer.go   |   8 ++-
 pkg/sentry/kernel/task_sched.go   |  33 +++++++++-
 pkg/sentry/kernel/thread_group.go |   3 +-
 pkg/sentry/kernel/time/time.go    |  27 +++++---
 test/syscalls/linux/itimer.cc     |   4 +-
 7 files changed, 193 insertions(+), 14 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go
index 59403d9db..f8bf663bb 100644
--- a/pkg/sentry/fs/timerfd/timerfd.go
+++ b/pkg/sentry/fs/timerfd/timerfd.go
@@ -141,9 +141,10 @@ func (t *TimerOperations) Write(context.Context, *fs.File, usermem.IOSequence, i
 }
 
 // Notify implements ktime.TimerListener.Notify.
-func (t *TimerOperations) Notify(exp uint64) {
+func (t *TimerOperations) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
 	atomic.AddUint64(&t.val, exp)
 	t.events.Notify(waiter.EventIn)
+	return ktime.Setting{}, false
 }
 
 // Destroy implements ktime.TimerListener.Destroy.
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 8c1f79ab5..3cda03891 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -24,6 +24,7 @@
 //       TaskSet.mu
 //         SignalHandlers.mu
 //           Task.mu
+//       runningTasksMu
 //
 // Locking SignalHandlers.mu in multiple SignalHandlers requires locking
 // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
@@ -135,6 +136,22 @@ type Kernel struct {
 	// syslog is the kernel log.
 	syslog syslog
 
+	// runningTasksMu synchronizes disable/enable of cpuClockTicker when
+	// the kernel is idle (runningTasks == 0).
+	//
+	// runningTasksMu is used to exclude critical sections when the timer
+	// disables itself and when the first active task enables the timer,
+	// ensuring that tasks always see a valid cpuClock value.
+	runningTasksMu sync.Mutex `state:"nosave"`
+
+	// runningTasks is the total count of tasks currently in
+	// TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are
+	// not blocked or stopped.
+	//
+	// runningTasks must be accessed atomically. Increments from 0 to 1 are
+	// further protected by runningTasksMu (see incRunningTasks).
+	runningTasks int64
+
 	// cpuClock is incremented every linux.ClockTick. cpuClock is used to
 	// measure task CPU usage, since sampling monotonicClock twice on every
 	// syscall turns out to be unreasonably expensive. This is similar to how
@@ -150,6 +167,22 @@ type Kernel struct {
 	// cpuClockTicker increments cpuClock.
 	cpuClockTicker *ktime.Timer `state:"nosave"`
 
+	// cpuClockTickerDisabled indicates that cpuClockTicker has been
+	// disabled because no tasks are running.
+	//
+	// cpuClockTickerDisabled is protected by runningTasksMu.
+	cpuClockTickerDisabled bool
+
+	// cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the
+	// point it was disabled. It is cached here to avoid a lock ordering
+	// violation with cpuClockTicker.mu when runningTaskMu is held.
+	//
+	// cpuClockTickerSetting is only valid when cpuClockTickerDisabled is
+	// true.
+	//
+	// cpuClockTickerSetting is protected by runningTasksMu.
+	cpuClockTickerSetting ktime.Setting
+
 	// fdMapUids is an ever-increasing counter for generating FDTable uids.
 	//
 	// fdMapUids is mutable, and is accessed using atomic memory operations.
@@ -912,6 +945,102 @@ func (k *Kernel) resumeTimeLocked() {
 	}
 }
 
+func (k *Kernel) incRunningTasks() {
+	for {
+		tasks := atomic.LoadInt64(&k.runningTasks)
+		if tasks != 0 {
+			// Standard case. Simply increment.
+			if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) {
+				continue
+			}
+			return
+		}
+
+		// Transition from 0 -> 1. Synchronize with other transitions and timer.
+		k.runningTasksMu.Lock()
+		tasks = atomic.LoadInt64(&k.runningTasks)
+		if tasks != 0 {
+			// We're no longer the first task, no need to
+			// re-enable.
+			atomic.AddInt64(&k.runningTasks, 1)
+			k.runningTasksMu.Unlock()
+			return
+		}
+
+		if !k.cpuClockTickerDisabled {
+			// Timer was never disabled.
+			atomic.StoreInt64(&k.runningTasks, 1)
+			k.runningTasksMu.Unlock()
+			return
+		}
+
+		// We need to update cpuClock for all of the ticks missed while we
+		// slept, and then re-enable the timer.
+		//
+		// The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify
+		// always increments cpuClock by 1 regardless of the number of
+		// expirations as a heuristic to avoid over-accounting in cases of CPU
+		// throttling.
+		//
+		// We want to cover the normal case, when all time should be accounted,
+		// so we increment for all expirations. Throttling is less concerning
+		// here because the ticker is only disabled from Notify. This means
+		// that Notify must schedule and compensate for the throttled period
+		// before the timer is disabled. Throttling while the timer is disabled
+		// doesn't matter, as nothing is running or reading cpuClock anyways.
+		//
+		// S/R also adds complication, as there are two cases. Recall that
+		// monotonicClock will jump forward on restore.
+		//
+		// 1. If the ticker is enabled during save, then on Restore Notify is
+		// called with many expirations, covering the time jump, but cpuClock
+		// is only incremented by 1.
+		//
+		// 2. If the ticker is disabled during save, then after Restore the
+		// first wakeup will call this function and cpuClock will be
+		// incremented by the number of expirations across the S/R.
+		//
+		// These cause very different value of cpuClock. But again, since
+		// nothing was running while the ticker was disabled, those differences
+		// don't matter.
+		setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now())
+		if exp > 0 {
+			atomic.AddUint64(&k.cpuClock, exp)
+		}
+
+		// Now that cpuClock is updated it is safe to allow other tasks to
+		// transition to running.
+		atomic.StoreInt64(&k.runningTasks, 1)
+
+		// N.B. we must unlock before calling Swap to maintain lock ordering.
+		//
+		// cpuClockTickerDisabled need not wait until after Swap to become
+		// true. It is sufficient that the timer *will* be enabled.
+		k.cpuClockTickerDisabled = false
+		k.runningTasksMu.Unlock()
+
+		// This won't call Notify (unless it's been ClockTick since setting.At
+		// above). This means we skip the thread group work in Notify. However,
+		// since nothing was running while we were disabled, none of the timers
+		// could have expired.
+		k.cpuClockTicker.Swap(setting)
+
+		return
+	}
+}
+
+func (k *Kernel) decRunningTasks() {
+	tasks := atomic.AddInt64(&k.runningTasks, -1)
+	if tasks < 0 {
+		panic(fmt.Sprintf("Invalid running count %d", tasks))
+	}
+
+	// Nothing to do. The next CPU clock tick will disable the timer if
+	// there is still nothing running. This provides approximately one tick
+	// of slack in which we can switch back and forth between idle and
+	// active without an expensive transition.
+}
+
 // WaitExited blocks until all tasks in k have exited.
 func (k *Kernel) WaitExited() {
 	k.tasks.liveGoroutines.Wait()
diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go
index c5d095af7..2e861a5a8 100644
--- a/pkg/sentry/kernel/posixtimer.go
+++ b/pkg/sentry/kernel/posixtimer.go
@@ -117,9 +117,9 @@ func (it *IntervalTimer) signalRejectedLocked() {
 }
 
 // Notify implements ktime.TimerListener.Notify.
-func (it *IntervalTimer) Notify(exp uint64) {
+func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
 	if it.target == nil {
-		return
+		return ktime.Setting{}, false
 	}
 
 	it.target.tg.pidns.owner.mu.RLock()
@@ -129,7 +129,7 @@ func (it *IntervalTimer) Notify(exp uint64) {
 
 	if it.sigpending {
 		it.overrunCur += exp
-		return
+		return ktime.Setting{}, false
 	}
 
 	// sigpending must be set before sendSignalTimerLocked() so that it can be
@@ -148,6 +148,8 @@ func (it *IntervalTimer) Notify(exp uint64) {
 	if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil {
 		it.signalRejectedLocked()
 	}
+
+	return ktime.Setting{}, false
 }
 
 // Destroy implements ktime.TimerListener.Destroy. Users of Timer should call
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index e76c069b0..8b148db35 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -126,12 +126,22 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
 	t.gosched.Timestamp = now
 	t.gosched.State = state
 	t.goschedSeq.EndWrite()
+
+	if state != TaskGoroutineRunningApp {
+		// Task is blocking/stopping.
+		t.k.decRunningTasks()
+	}
 }
 
 // Preconditions: The caller must be running on the task goroutine, and leaving
 // a state indicated by a previous call to
 // t.accountTaskGoroutineEnter(state).
 func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
+	if state != TaskGoroutineRunningApp {
+		// Task is unblocking/continuing.
+		t.k.incRunningTasks()
+	}
+
 	now := t.k.CPUClockNow()
 	if t.gosched.State != state {
 		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
@@ -330,7 +340,7 @@ func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker {
 }
 
 // Notify implements ktime.TimerListener.Notify.
-func (ticker *kernelCPUClockTicker) Notify(exp uint64) {
+func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
 	// Only increment cpuClock by 1 regardless of the number of expirations.
 	// This approximately compensates for cases where thread throttling or bad
 	// Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and
@@ -426,6 +436,27 @@ func (ticker *kernelCPUClockTicker) Notify(exp uint64) {
 		tgs[i] = nil
 	}
 	ticker.tgs = tgs[:0]
+
+	// If nothing is running, we can disable the timer.
+	tasks := atomic.LoadInt64(&ticker.k.runningTasks)
+	if tasks == 0 {
+		ticker.k.runningTasksMu.Lock()
+		defer ticker.k.runningTasksMu.Unlock()
+		tasks := atomic.LoadInt64(&ticker.k.runningTasks)
+		if tasks != 0 {
+			// Raced with a 0 -> 1 transition.
+			return setting, false
+		}
+
+		// Stop the timer. We must cache the current setting so the
+		// kernel can access it without violating the lock order.
+		ticker.k.cpuClockTickerSetting = setting
+		ticker.k.cpuClockTickerDisabled = true
+		setting.Enabled = false
+		return setting, true
+	}
+
+	return setting, false
 }
 
 // Destroy implements ktime.TimerListener.Destroy.
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 0eef24bfb..72568d296 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -511,8 +511,9 @@ type itimerRealListener struct {
 }
 
 // Notify implements ktime.TimerListener.Notify.
-func (l *itimerRealListener) Notify(exp uint64) {
+func (l *itimerRealListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
 	l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM))
+	return ktime.Setting{}, false
 }
 
 // Destroy implements ktime.TimerListener.Destroy.
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index aa6c75d25..107394183 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -280,13 +280,16 @@ func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
 // A TimerListener receives expirations from a Timer.
 type TimerListener interface {
 	// Notify is called when its associated Timer expires. exp is the number of
-	// expirations.
+	// expirations. setting is the next timer Setting.
 	//
 	// Notify is called with the associated Timer's mutex locked, so Notify
 	// must not take any locks that precede Timer.mu in lock order.
 	//
+	// If Notify returns true, the timer will use the returned setting
+	// rather than the passed one.
+	//
 	// Preconditions: exp > 0.
-	Notify(exp uint64)
+	Notify(exp uint64, setting Setting) (newSetting Setting, update bool)
 
 	// Destroy is called when the timer is destroyed.
 	Destroy()
@@ -533,7 +536,9 @@ func (t *Timer) Tick() {
 	s, exp := t.setting.At(now)
 	t.setting = s
 	if exp > 0 {
-		t.listener.Notify(exp)
+		if newS, ok := t.listener.Notify(exp, t.setting); ok {
+			t.setting = newS
+		}
 	}
 	t.resetKickerLocked(now)
 }
@@ -588,7 +593,9 @@ func (t *Timer) Get() (Time, Setting) {
 	s, exp := t.setting.At(now)
 	t.setting = s
 	if exp > 0 {
-		t.listener.Notify(exp)
+		if newS, ok := t.listener.Notify(exp, t.setting); ok {
+			t.setting = newS
+		}
 	}
 	t.resetKickerLocked(now)
 	return now, s
@@ -620,7 +627,9 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
 	}
 	oldS, oldExp := t.setting.At(now)
 	if oldExp > 0 {
-		t.listener.Notify(oldExp)
+		t.listener.Notify(oldExp, oldS)
+		// N.B. The returned Setting doesn't matter because we're about
+		// to overwrite.
 	}
 	if f != nil {
 		f()
@@ -628,7 +637,9 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
 	newS, newExp := s.At(now)
 	t.setting = newS
 	if newExp > 0 {
-		t.listener.Notify(newExp)
+		if newS, ok := t.listener.Notify(newExp, t.setting); ok {
+			t.setting = newS
+		}
 	}
 	t.resetKickerLocked(now)
 	return now, oldS
@@ -683,11 +694,13 @@ func NewChannelNotifier() (TimerListener, <-chan struct{}) {
 }
 
 // Notify implements ktime.TimerListener.Notify.
-func (c *ChannelNotifier) Notify(uint64) {
+func (c *ChannelNotifier) Notify(uint64, Setting) (Setting, bool) {
 	select {
 	case c.tchan <- struct{}{}:
 	default:
 	}
+
+	return Setting{}, false
 }
 
 // Destroy implements ktime.TimerListener.Destroy and will close the channel.
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 51ce323b9..930d2b940 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -336,7 +336,9 @@ int main(int argc, char** argv) {
     }
     if (arg == gvisor::testing::kSIGPROFFairnessIdle) {
       MaskSIGPIPE();
-      return gvisor::testing::TestSIGPROFFairness(absl::Milliseconds(10));
+      // Sleep time > ClockTick (10ms) exercises sleeping gVisor's
+      // kernel.cpuClockTicker.
+      return gvisor::testing::TestSIGPROFFairness(absl::Milliseconds(25));
     }
   }
 
-- 
cgit v1.2.3


From 0d483985c57a2d001039d17bd198e2eca0f4ff7f Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 1 Oct 2019 15:41:32 -0700
Subject: Include AT_SECURE in the aux vector

gVisor does not currently implement the functionality that would result in
AT_SECURE = 1, but Linux includes AT_SECURE = 0 in the normal case, so we
should do the same.
PiperOrigin-RevId: 272311488
---
 pkg/sentry/kernel/task_identity.go | 4 ++--
 pkg/sentry/loader/loader.go        | 3 +++
 test/syscalls/linux/proc.cc        | 5 +++++
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'pkg/sentry')

diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index 78ff14b20..ce3e6ef28 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -465,8 +465,8 @@ func (t *Task) SetKeepCaps(k bool) {
 // disables the features we don't support anyway, is always set. This
 // drastically simplifies this function.
 //
-// - We don't implement AT_SECURE, because no_new_privs always being set means
-// that the conditions that require AT_SECURE never arise. (Compare Linux's
+// - We don't set AT_SECURE = 1, because no_new_privs always being set means
+// that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's
 // security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().)
 //
 // - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index f6f1ae762..089d1635b 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -308,6 +308,9 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r
 		arch.AuxEntry{linux.AT_EUID, usermem.Addr(c.EffectiveKUID.In(c.UserNamespace).OrOverflow())},
 		arch.AuxEntry{linux.AT_GID, usermem.Addr(c.RealKGID.In(c.UserNamespace).OrOverflow())},
 		arch.AuxEntry{linux.AT_EGID, usermem.Addr(c.EffectiveKGID.In(c.UserNamespace).OrOverflow())},
+		// The conditions that require AT_SECURE = 1 never arise. See
+		// kernel.Task.updateCredsForExecLocked.
+		arch.AuxEntry{linux.AT_SECURE, 0},
 		arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC},
 		arch.AuxEntry{linux.AT_EXECFN, execfn},
 		arch.AuxEntry{linux.AT_RANDOM, random},
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 6f07803d9..e4c030bbb 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -440,6 +440,11 @@ TEST(ProcSelfAuxv, EntryPresence) {
   EXPECT_EQ(auxv_entries.count(AT_PHENT), 1);
   EXPECT_EQ(auxv_entries.count(AT_PHNUM), 1);
   EXPECT_EQ(auxv_entries.count(AT_BASE), 1);
+  EXPECT_EQ(auxv_entries.count(AT_UID), 1);
+  EXPECT_EQ(auxv_entries.count(AT_EUID), 1);
+  EXPECT_EQ(auxv_entries.count(AT_GID), 1);
+  EXPECT_EQ(auxv_entries.count(AT_EGID), 1);
+  EXPECT_EQ(auxv_entries.count(AT_SECURE), 1);
   EXPECT_EQ(auxv_entries.count(AT_CLKTCK), 1);
   EXPECT_EQ(auxv_entries.count(AT_RANDOM), 1);
   EXPECT_EQ(auxv_entries.count(AT_EXECFN), 1);
-- 
cgit v1.2.3