107 files changed, 3503 insertions, 1715 deletions
diff --git a/WORKSPACE b/WORKSPACE
index e1873e5c0..417ec6100 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -370,6 +370,112 @@ go_repository(
     version = "v1.5.0",
 )
 
+# Docker API dependencies.
+go_repository(
+    name = "com_github_docker_docker",
+    importpath = "github.com/docker/docker",
+    sum = "h1:iWPIG7pWIsCwT6ZtHnTUpoVMnete7O/pzd9HFE3+tn8=",
+    version = "v17.12.0-ce-rc1.0.20200618181300-9dc6525e6118+incompatible",
+)
+
+go_repository(
+    name = "com_github_docker_go_connections",
+    importpath = "github.com/docker/go-connections",
+    sum = "h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ=",
+    version = "v0.4.0",
+)
+
+go_repository(
+    name = "com_github_pkg_errors",
+    importpath = "github.com/pkg/errors",
+    sum = "h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=",
+    version = "v0.9.1",
+)
+
+go_repository(
+    name = "com_github_docker_go_units",
+    importpath = "github.com/docker/go-units",
+    sum = "h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=",
+    version = "v0.4.0",
+)
+
+go_repository(
+    name = "com_github_opencontainers_go_digest",
+    importpath = "github.com/opencontainers/go-digest",
+    sum = "h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=",
+    version = "v1.0.0",
+)
+
+go_repository(
+    name = "com_github_docker_distribution",
+    importpath = "github.com/docker/distribution",
+    sum = "h1:a5mlkVzth6W5A4fOsS3D2EO5BUmsJpcB+cRlLU7cSug=",
+    version = "v2.7.1+incompatible",
+)
+
+go_repository(
+    name = "com_github_davecgh_go_spew",
+    importpath = "github.com/davecgh/go-spew",
+    sum = "h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=",
+    version = "v1.1.1",
+)
+
+go_repository(
+    name = "com_github_konsorten_go_windows_terminal_sequences",
+    importpath = "github.com/konsorten/go-windows-terminal-sequences",
+    sum = "h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=",
+    version = "v1.0.3",
+)
+
+go_repository(
+    name = "com_github_pmezard_go_difflib",
+    importpath = "github.com/pmezard/go-difflib",
+    sum = "h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=",
+    version = "v1.0.0",
+)
+
+go_repository(
+    name = "com_github_sirupsen_logrus",
+    importpath = "github.com/sirupsen/logrus",
+    sum = "h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=",
+    version = "v1.6.0",
+)
+
+go_repository(
+    name = "com_github_stretchr_testify",
+    importpath = "github.com/stretchr/testify",
+    sum = "h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=",
+    version = "v1.2.2",
+)
+
+go_repository(
+    name = "com_github_opencontainers_image_spec",
+    importpath = "github.com/opencontainers/image-spec",
+    sum = "h1:JMemWkRwHx4Zj+fVxWoMCFm/8sYGGrUVojFA6h/TRcI=",
+    version = "v1.0.1",
+)
+
+go_repository(
+    name = "com_github_containerd_containerd",
+    importpath = "github.com/containerd/containerd",
+    sum = "h1:3o0smo5SKY7H6AJCmJhsnCjR2/V2T8VmiHt7seN2/kI=",
+    version = "v1.3.4",
+)
+
+go_repository(
+    name = "com_github_microsoft_go_winio",
+    importpath = "github.com/Microsoft/go-winio",
+    sum = "h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU=",
+    version = "v0.4.14",
+)
+
+go_repository(
+    name = "com_github_stretchr_objx",
+    importpath = "github.com/stretchr/objx",
+    sum = "h1:2vfRuCMp5sSVIDSqO8oNnWJq7mPa6KVP3iPIwFBuy8A=",
+    version = "v0.1.1",
+)
+
 go_repository(
     name = "org_golang_google_api",
     importpath = "google.golang.org/api",
@@ -450,3 +556,4 @@ http_archive(
         "https://github.com/google/benchmark/archive/v1.5.0.tar.gz",
     ],
 )
+
diff --git a/g3doc/README.md b/g3doc/README.md
index 304a91493..7956fe739 100644
--- a/g3doc/README.md
+++ b/g3doc/README.md
@@ -152,7 +152,7 @@ The application is a normal Linux binary provided to gVisor in an OCI runtime
 bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so
 applications should be able to run unmodified. However, gVisor does not
 presently implement every system call, `/proc` file, or `/sys` file so some
-incompatibilities may occur. See [Commpatibility](./user_guide/compatibility.md)
+incompatibilities may occur. See [Compatibility](./user_guide/compatibility.md)
 for more information.
 
 [9p]: https://en.wikipedia.org/wiki/9P_(protocol)
diff --git a/g3doc/user_guide/debugging.md b/g3doc/user_guide/debugging.md
index 0525fd5c0..54fdce34f 100644
--- a/g3doc/user_guide/debugging.md
+++ b/g3doc/user_guide/debugging.md
@@ -129,3 +129,13 @@ go tool pprof -top /usr/local/bin/runsc /tmp/cpu.prof
 ```
 
 [pprof]: https://github.com/google/pprof/blob/master/doc/README.md
+
+### Docker Proxy
+
+When forwarding a port to the container, Docker will likely route traffic
+through the [docker-proxy][]. This proxy may make profiling noisy, so it can be
+helpful to bypass it. Do so by sending traffic directly to the container IP and
+port. For example, if the `docker0` IP is `192.168.9.1`, the container IP is
+likely a subsequent IP, such as `192.168.9.2`.
+
+[docker-proxy]: https://windsock.io/the-docker-proxy/
diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md
index 877169145..e7768946b 100644
--- a/g3doc/user_guide/quick_start/oci.md
+++ b/g3doc/user_guide/quick_start/oci.md
@@ -15,8 +15,8 @@ mkdir bundle
 cd bundle
 ```
 
-Create a root file system for the container. We will use the Docker hello-world
-image as the basis for our container.
+Create a root file system for the container. We will use the Docker
+`hello-world` image as the basis for our container.
 
 ```bash
 mkdir rootfs
@@ -24,12 +24,10 @@ docker export $(docker create hello-world) | tar -xf - -C rootfs
 ```
 
 Next, create an specification file called `config.json` that contains our
-container specification. We will update the default command it runs to `/hello`
-in the `hello-world` container.
+container specification. We tell the container to run the `/hello` program.
 
 ```bash
-runsc spec
-sed -i 's;"sh";"/hello";' config.json
+runsc spec -- /hello
 ```
 
 Finally run the container.
diff --git a/g3doc/user_guide/tutorials/cni.md b/g3doc/user_guide/tutorials/cni.md
index ad6c9fa59..ce2fd09a8 100644
--- a/g3doc/user_guide/tutorials/cni.md
+++ b/g3doc/user_guide/tutorials/cni.md
@@ -128,12 +128,14 @@ sudo mkdir -p rootfs/var/www/html
 sudo sh -c 'echo "Hello World!" > rootfs/var/www/html/index.html'
 ```
 
-Next create the `config.json` specifying the network namespace. `sudo
-/usr/local/bin/runsc spec sudo sed -i 's;"sh";"python", "-m", "http.server";'
-config.json sudo sed -i "s;\"cwd\": \"/\";\"cwd\": \"/var/www/html\";"
-config.json sudo sed -i "s;\"type\": \"network\";\"type\":
-\"network\",\n\t\t\t\t\"path\": \"/var/run/netns/${CNI_CONTAINERID}\";"
-config.json`
+Next create the `config.json` specifying the network namespace.
+
+```bash
+sudo /usr/local/bin/runsc spec \
+    --cwd /var/www/html \
+    --netns /var/run/netns/${CNI_CONTAINERID} \
+    -- python -m http.server
+```
 
 ## Run the Container
 
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 5d83fe363..8c7c8e1b3 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -85,6 +85,7 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 	d2 := &dentry{
 		refs:      1, // held by d
 		fs:        d.fs,
+		ino:       d.fs.nextSyntheticIno(),
 		mode:      uint32(opts.mode),
 		uid:       uint32(opts.kuid),
 		gid:       uint32(opts.kgid),
@@ -184,13 +185,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 		{
 			Name:    ".",
 			Type:    linux.DT_DIR,
-			Ino:     d.ino,
+			Ino:     uint64(d.ino),
 			NextOff: 1,
 		},
 		{
 			Name:    "..",
 			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
-			Ino:     parent.ino,
+			Ino:     uint64(parent.ino),
 			NextOff: 2,
 		},
 	}
@@ -226,7 +227,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 				}
 				dirent := vfs.Dirent{
 					Name:    p9d.Name,
-					Ino:     p9d.QID.Path,
+					Ino:     uint64(inoFromPath(p9d.QID.Path)),
 					NextOff: int64(len(dirents) + 1),
 				}
 				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
@@ -259,7 +260,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 			dirents = append(dirents, vfs.Dirent{
 				Name:    child.name,
 				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
-				Ino:     child.ino,
+				Ino:     uint64(child.ino),
 				NextOff: int64(len(dirents) + 1),
 			})
 		}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 7bcc99b29..cd5f5049e 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -214,9 +214,8 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		return nil, err
 	}
 	if child != nil {
-		if !file.isNil() && qid.Path == child.ino {
-			// The file at this path hasn't changed. Just update cached
-			// metadata.
+		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+			// The file at this path hasn't changed. Just update cached metadata.
 			file.close(ctx)
 			child.updateFromP9Attrs(attrMask, &attr)
 			return child, nil
@@ -1499,3 +1498,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.renameMu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
+
+func (fs *filesystem) nextSyntheticIno() inodeNumber {
+	return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 8e74e60a5..2b83094cd 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -110,6 +110,26 @@ type filesystem struct {
 	syncMu           sync.Mutex
 	syncableDentries map[*dentry]struct{}
 	specialFileFDs   map[*specialFileFD]struct{}
+
+	// syntheticSeq stores a counter used to generate unique inodeNumbers for
+	// synthetic dentries. It is accessed using atomic memory operations.
+	syntheticSeq uint64
+}
+
+// inodeNumber represents the inode number reported in Dirent.Ino. For regular
+// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
+// have their inodeNumber generated sequentially, with the MSB reserved to
+// prevent conflicts with regular dentries.
+type inodeNumber uint64
+
+// Reserve MSB for synthetic mounts.
+const syntheticInoMask = uint64(1) << 63
+
+func inoFromPath(path uint64) inodeNumber {
+	if path&syntheticInoMask != 0 {
+		log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
+	}
+	return inodeNumber(path &^ syntheticInoMask)
 }
 
 type filesystemOptions struct {
@@ -585,11 +605,11 @@ type dentry struct {
 	// Cached metadata; protected by metadataMu and accessed using atomic
 	// memory operations unless otherwise specified.
 	metadataMu sync.Mutex
-	ino        uint64 // immutable
-	mode       uint32 // type is immutable, perms are mutable
-	uid        uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid        uint32 // auth.KGID, but ...
-	blockSize  uint32 // 0 if unknown
+	ino        inodeNumber // immutable
+	mode       uint32      // type is immutable, perms are mutable
+	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32      // auth.KGID, but ...
+	blockSize  uint32      // 0 if unknown
 	// Timestamps, all nsecs from the Unix epoch.
 	atime int64
 	mtime int64
@@ -704,7 +724,7 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	d := &dentry{
 		fs:        fs,
 		file:      file,
-		ino:       qid.Path,
+		ino:       inoFromPath(qid.Path),
 		mode:      uint32(attr.Mode),
 		uid:       uint32(fs.opts.dfltuid),
 		gid:       uint32(fs.opts.dfltgid),
@@ -846,7 +866,7 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
-	stat.Ino = d.ino
+	stat.Ino = uint64(d.ino)
 	stat.Size = atomic.LoadUint64(&d.size)
 	// This is consistent with regularFileFD.Seek(), which treats regular files
 	// as having no holes.
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 724a3f1f7..8792ca4f2 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -126,11 +126,16 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
 }
 
 func (h *handle) sync(ctx context.Context) error {
+	// Handle most common case first.
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
 		err := syscall.Fsync(int(h.fd))
 		ctx.UninterruptibleSleepFinish(false)
 		return err
 	}
+	if h.file.isNil() {
+		// File hasn't been touched, there is nothing to sync.
+		return nil
+	}
 	return h.file.fsync(ctx)
 }
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 3d2d3530a..a2f02d9c7 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -582,20 +582,19 @@ func (fd *regularFileFD) Sync(ctx context.Context) error {
 
 func (d *dentry) syncSharedHandle(ctx context.Context) error {
 	d.handleMu.RLock()
-	if !d.handleWritable {
-		d.handleMu.RUnlock()
-		return nil
-	}
-	d.dataMu.Lock()
-	// Write dirty cached data to the remote file.
-	err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
-	d.dataMu.Unlock()
-	if err == nil {
-		// Sync the remote file.
-		err = d.handle.sync(ctx)
+	defer d.handleMu.RUnlock()
+
+	if d.handleWritable {
+		d.dataMu.Lock()
+		// Write dirty cached data to the remote file.
+		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+		d.dataMu.Unlock()
+		if err != nil {
+			return err
+		}
 	}
-	d.handleMu.RUnlock()
-	return err
+	// Sync the remote file.
+	return d.handle.sync(ctx)
 }
 
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 3c4e7e2e4..c1e6b13e5 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -41,10 +41,10 @@ type specialFileFD struct {
 	// file offset is significant, i.e. a regular file. seekable is immutable.
 	seekable bool
 
-	// mayBlock is true if this file description represents a file for which
-	// queue may send I/O readiness events. mayBlock is immutable.
-	mayBlock bool
-	queue    waiter.Queue
+	// haveQueue is true if this file description represents a file for which
+	// queue may send I/O readiness events. haveQueue is immutable.
+	haveQueue bool
+	queue     waiter.Queue
 
 	// If seekable is true, off is the file offset. off is protected by mu.
 	mu  sync.Mutex
@@ -54,14 +54,14 @@ type specialFileFD struct {
 func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
 	ftype := d.fileType()
 	seekable := ftype == linux.S_IFREG
-	mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK
+	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
 	fd := &specialFileFD{
-		handle:   h,
-		seekable: seekable,
-		mayBlock: mayBlock,
+		handle:    h,
+		seekable:  seekable,
+		haveQueue: haveQueue,
 	}
 	fd.LockFD.Init(locks)
-	if mayBlock && h.fd >= 0 {
+	if haveQueue {
 		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
 			return nil, err
 		}
@@ -70,7 +70,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks,
 		DenyPRead:  !seekable,
 		DenyPWrite: !seekable,
 	}); err != nil {
-		if mayBlock && h.fd >= 0 {
+		if haveQueue {
 			fdnotifier.RemoveFD(h.fd)
 		}
 		return nil, err
@@ -80,7 +80,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks,
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *specialFileFD) Release() {
-	if fd.mayBlock && fd.handle.fd >= 0 {
+	if fd.haveQueue {
 		fdnotifier.RemoveFD(fd.handle.fd)
 	}
 	fd.handle.close(context.Background())
@@ -100,7 +100,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
 
 // Readiness implements waiter.Waitable.Readiness.
 func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
-	if fd.mayBlock {
+	if fd.haveQueue {
 		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
 	}
 	return fd.fileDescription.Readiness(mask)
@@ -108,8 +108,9 @@ func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // EventRegister implements waiter.Waitable.EventRegister.
 func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	if fd.mayBlock {
+	if fd.haveQueue {
 		fd.queue.EventRegister(e, mask)
+		fdnotifier.UpdateFD(fd.handle.fd)
 		return
 	}
 	fd.fileDescription.EventRegister(e, mask)
@@ -117,8 +118,9 @@ func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
 func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
-	if fd.mayBlock {
+	if fd.haveQueue {
 		fd.queue.EventUnregister(e)
+		fdnotifier.UpdateFD(fd.handle.fd)
 		return
 	}
 	fd.fileDescription.EventUnregister(e)
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index c16a36cdb..e743e8114 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("creating platform: %v", err)
 	}
 
+	kernel.VFS2Enabled = true
 	k := &kernel.Kernel{
 		Platform: plat,
 	}
@@ -73,7 +74,7 @@ func Boot() (*kernel.Kernel, error) {
 	k.SetMemoryFile(mf)
 
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
-	kernel.VFS2Enabled = true
-
-	if err := k.VFS().Init(); err != nil {
-		return nil, fmt.Errorf("VFS init: %v", err)
-	}
 	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 		AllowUserList:  true,
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index ed40f6b52..a0f20c2d4 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -277,7 +277,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		creds := rp.Credentials()
 		var childInode *inode
 		switch opts.Mode.FileType() {
-		case 0, linux.S_IFREG:
+		case linux.S_IFREG:
 			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFIFO:
 			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index c6aa65f28..34bdb0b69 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -30,9 +30,6 @@ go_library(
         "//pkg/rand",
         "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/fs",
-        "//pkg/sentry/fs/anon",
-        "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
@@ -45,6 +42,5 @@ go_library(
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
-        "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 616fafa2c..ddeaff3db 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -90,14 +90,23 @@ type elfInfo struct {
 	sharedObject bool
 }
 
+// fullReader interface extracts the ReadFull method from fsbridge.File so that
+// client code does not need to define an entire fsbridge.File when only read
+// functionality is needed.
+//
+// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap
+// vfs.FileDescription's PRead/Read instead.
+type fullReader interface {
+	// ReadFull is the same as fsbridge.File.ReadFull.
+	ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+}
+
 // parseHeader parse the ELF header, verifying that this is a supported ELF
 // file and returning the ELF program headers.
 //
 // This is similar to elf.NewFile, except that it is more strict about what it
 // accepts from the ELF, and it doesn't parse unnecessary parts of the file.
-//
-// ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
 	// Check ident first; it will tell us the endianness of the rest of the
 	// structs.
 	var ident [elf.EI_NIDENT]byte
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 88449fe95..986c7fb4d 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -27,7 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -80,22 +79,6 @@ type LoadArgs struct {
 	Features *cpuid.FeatureSet
 }
 
-// readFull behaves like io.ReadFull for an *fs.File.
-func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	var total int64
-	for dst.NumBytes() > 0 {
-		n, err := f.Preadv(ctx, dst, offset+total)
-		total += n
-		if err == io.EOF && total != 0 {
-			return total, io.ErrUnexpectedEOF
-		} else if err != nil {
-			return total, err
-		}
-		dst = dst.DropFirst64(n)
-	}
-	return total, nil
-}
-
 // openPath opens args.Filename and checks that it is valid for loading.
 //
 // openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not
@@ -238,14 +221,14 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	// Load the executable itself.
 	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
 	if err != nil {
-		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
 	}
 	defer file.DecRef()
 
 	// Load the VDSO.
 	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
 	if err != nil {
-		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
 	}
 
 	// Setup the heap. brk starts at the next page after the end of the
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 165869028..05a294fe6 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -26,10 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
-	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -37,7 +33,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 const vdsoPrelink = 0xffffffffff700000
@@ -55,52 +50,11 @@ func (f *fileContext) Value(key interface{}) interface{} {
 	}
 }
 
-// byteReader implements fs.FileOperations for reading from a []byte source.
-type byteReader struct {
-	fsutil.FileNoFsync              `state:"nosave"`
-	fsutil.FileNoIoctl              `state:"nosave"`
-	fsutil.FileNoMMap               `state:"nosave"`
-	fsutil.FileNoSplice             `state:"nosave"`
-	fsutil.FileNoopFlush            `state:"nosave"`
-	fsutil.FileNoopRelease          `state:"nosave"`
-	fsutil.FileNotDirReaddir        `state:"nosave"`
-	fsutil.FilePipeSeek             `state:"nosave"`
-	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-	waiter.AlwaysReady              `state:"nosave"`
-
+type byteFullReader struct {
 	data []byte
 }
 
-var _ fs.FileOperations = (*byteReader)(nil)
-
-// newByteReaderFile creates a fake file to read data from.
-//
-// TODO(gvisor.dev/issue/2921): Convert to VFS2.
-func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
-	// Create a fake inode.
-	inode := fs.NewInode(
-		ctx,
-		&fsutil.SimpleFileInode{},
-		fs.NewPseudoMountSource(ctx),
-		fs.StableAttr{
-			Type:      fs.Anonymous,
-			DeviceID:  anon.PseudoDevice.DeviceID(),
-			InodeID:   anon.PseudoDevice.NextIno(),
-			BlockSize: usermem.PageSize,
-		})
-
-	// Use the fake inode to create a fake dirent.
-	dirent := fs.NewTransientDirent(inode)
-	defer dirent.DecRef()
-
-	// Use the fake dirent to make a fake file.
-	flags := fs.FileFlags{Read: true, Pread: true}
-	return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
-		data: data,
-	})
-}
-
-func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
@@ -111,10 +65,6 @@ func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequ
 	return int64(n), err
 }
 
-func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
-	panic("Write not supported")
-}
-
 // validateVDSO checks that the VDSO can be loaded by loadVDSO.
 //
 // VDSOs are special (see below). Since we are going to map the VDSO directly
@@ -130,7 +80,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
 // * PT_LOAD segments don't extend beyond the end of the file.
 //
 // ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		log.Infof("Unable to parse VDSO header: %v", err)
@@ -248,13 +198,12 @@ func getSymbolValueFromVDSO(symbol string) (uint64, error) {
 
 // PrepareVDSO validates the system VDSO and returns a VDSO, containing the
 // param page for updating by the kernel.
-func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
-	vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
+func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
+	vdsoFile := &byteFullReader{data: vdsoBin}
 
 	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
 	// nil context can be passed.
 	info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
-	vdsoFile.DecRef()
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e7d2c83d7..78a842973 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -192,6 +192,7 @@ var Metrics = tcpip.Stats{
 		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
 		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
 		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
+		InvalidSourceAddress:     mustCreateMetric("/netstack/udp/invalid_source", "Number of UDP datagrams dropped due to invalid source address."),
 	},
 }
 
@@ -1753,6 +1754,11 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 
 		return nil
 
+	case linux.SO_DETACH_FILTER:
+		// optval is ignored.
+		var v tcpip.SocketDetachFilterOption
+		return syserr.TranslateNetstackError(ep.SetSockOpt(v))
+
 	default:
 		socket.SetSockOptEmitUnimplementedEvent(t, name)
 	}
@@ -2112,13 +2118,22 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
 
+	case linux.IP_HDRINCL:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
 		linux.IP_CHECKSUM,
 		linux.IP_DROP_SOURCE_MEMBERSHIP,
 		linux.IP_FREEBIND,
-		linux.IP_HDRINCL,
 		linux.IP_IPSEC_POLICY,
 		linux.IP_MINTTL,
 		linux.IP_MSFILTER,
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index b12b5967b..6b14c2bef 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -107,7 +107,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	addr := args[0].Pointer()
 	mode := args[1].ModeT()
 	dev := args[2].Uint()
-	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
 }
 
 // Mknodat implements Linux syscall mknodat(2).
@@ -116,10 +116,10 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	addr := args[1].Pointer()
 	mode := args[2].ModeT()
 	dev := args[3].Uint()
-	return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+	return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
 }
 
-func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error {
 	path, err := copyInPath(t, addr)
 	if err != nil {
 		return err
@@ -129,9 +129,14 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint
 		return err
 	}
 	defer tpop.Release()
+
+	// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+	if mode.FileType() == 0 {
+		mode |= linux.ModeRegular
+	}
 	major, minor := linux.DecodeDeviceID(dev)
 	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
-		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
+		Mode:     mode &^ linux.FileMode(t.FSContext().Umask()),
 		DevMajor: uint32(major),
 		DevMinor: minor,
 	})
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 58c7ad778..522e27475 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -123,6 +123,9 @@ type VirtualFilesystem struct {
 
 // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
 func (vfs *VirtualFilesystem) Init() error {
+	if vfs.mountpoints != nil {
+		panic("VFS already initialized")
+	}
 	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
 	vfs.devices = make(map[devTuple]*registeredDevice)
 	vfs.anonBlockDevMinorNext = 1
diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD
index e131455f7..ae0fe1522 100644
--- a/pkg/sleep/BUILD
+++ b/pkg/sleep/BUILD
@@ -12,6 +12,7 @@ go_library(
         "sleep_unsafe.go",
     ],
     visibility = ["//:sandbox"],
+    deps = ["//pkg/sync"],
 )
 
 go_test(
diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go
index af47e2ba1..1dd11707d 100644
--- a/pkg/sleep/sleep_test.go
+++ b/pkg/sleep/sleep_test.go
@@ -379,10 +379,7 @@ func TestRace(t *testing.T) {
 // TestRaceInOrder tests that multiple wakers can continuously send wake requests to
 // the sleeper and that the wakers are retrieved in the order asserted.
 func TestRaceInOrder(t *testing.T) {
-	const wakers = 100
-	const wakeRequests = 10000
-
-	w := make([]Waker, wakers)
+	w := make([]Waker, 10000)
 	s := Sleeper{}
 
 	// Associate each waker and start goroutines that will assert them.
@@ -390,19 +387,16 @@ func TestRaceInOrder(t *testing.T) {
 		s.AddWaker(&w[i], i)
 	}
 	go func() {
-		n := 0
-		for n < wakeRequests {
-			wk := w[n%len(w)]
-			wk.Assert()
-			n++
+		for i := range w {
+			w[i].Assert()
 		}
 	}()
 
 	// Wait for all wake up notifications from all wakers.
-	for i := 0; i < wakeRequests; i++ {
-		v, _ := s.Fetch(true)
-		if got, want := v, i%wakers; got != want {
-			t.Fatalf("got  %d want %d", got, want)
+	for want := range w {
+		got, _ := s.Fetch(true)
+		if got != want {
+			t.Fatalf("got %d want %d", got, want)
 		}
 	}
 }
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index f68c12620..118805492 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -75,6 +75,8 @@ package sleep
 import (
 	"sync/atomic"
 	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 const (
@@ -323,7 +325,12 @@ func (s *Sleeper) enqueueAssertedWaker(w *Waker) {
 //
 // This struct is thread-safe, that is, its methods can be called concurrently
 // by multiple goroutines.
+//
+// Note, it is not safe to copy a Waker as its fields are modified by value
+// (the pointer fields are individually modified with atomic operations).
 type Waker struct {
+	_ sync.NoCopy
+
 	// s is the sleeper that this waker can wake up. Only one sleeper at a
 	// time is allowed. This field can have three classes of values:
 	// nil -- the waker is not asserted: it either is not associated with
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index d0d77e19c..4d47207f7 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -33,6 +33,7 @@ go_library(
         "aliases.go",
         "memmove_unsafe.go",
         "mutex_unsafe.go",
+        "nocopy.go",
         "norace_unsafe.go",
         "race_unsafe.go",
         "rwmutex_unsafe.go",
diff --git a/pkg/sync/nocopy.go b/pkg/sync/nocopy.go
new file mode 100644
index 000000000..722b29501
--- /dev/null
+++ b/pkg/sync/nocopy.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sync
+
+// NoCopy may be added as a field (e.g. `_ sync.NoCopy`) to structs which
+// must not be copied after the first use.
+//
+// See https://golang.org/issues/8005#issuecomment-190753527
+// for details.
+type NoCopy struct{}
+
+// Lock is a no-op; it exists for the -copylocks checker from `go vet`.
+func (*NoCopy) Lock() {}
+
+// Unlock is a no-op; it exists for the -copylocks checker from `go vet`.
+func (*NoCopy) Unlock() {}
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
index 44e25d475..69de6eb3e 100644
--- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -69,13 +69,12 @@ func NonBlockingWrite(fd int, buf []byte) *tcpip.Error {
 // NonBlockingWrite3 writes up to three byte slices to a file descriptor in a
 // single syscall. It fails if partial data is written.
 func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error {
-	// If the is no second buffer, issue a regular write.
-	if len(b2) == 0 {
+	// If there is no second and third buffer, issue a regular write.
+	if len(b2) == 0 && len(b3) == 0 {
 		return NonBlockingWrite(fd, b1)
 	}
 
-	// We have two buffers. Build the iovec that represents them and issue
-	// a writev syscall.
+	// Build the iovec that represents them and issue a writev syscall.
 	iovec := [3]syscall.Iovec{
 		{
 			Base: &b1[0],
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 7e9f16c90..b1776e5ee 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -225,12 +225,10 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv4 {
 	ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
 	length := uint16(hdr.UsedLength() + payloadSize)
-	id := uint32(0)
-	if length > header.IPv4MaximumHeaderSize+8 {
-		// Packets of 68 bytes or less are required by RFC 791 to not be
-		// fragmented, so we only assign ids to larger packets.
-		id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, params.Protocol, e.protocol.hashIV)%buckets], 1)
-	}
+	// RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic
+	// datagrams. Since the DF bit is never being set here, all datagrams
+	// are non-atomic and need an ID.
+	id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, params.Protocol, e.protocol.hashIV)%buckets], 1)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
 		TotalLength: length,
@@ -376,13 +374,12 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
 
 	// Set the packet ID when zero.
 	if ip.ID() == 0 {
-		id := uint32(0)
-		if pkt.Data.Size() > header.IPv4MaximumHeaderSize+8 {
-			// Packets of 68 bytes or less are required by RFC 791 to not be
-			// fragmented, so we only assign ids to larger packets.
-			id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)
+		// RFC 6864 section 4.3 mandates uniqueness of ID values for
+		// non-atomic datagrams, so assign an ID to all such datagrams
+		// according to the definition given in RFC 6864 section 4.
+		if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 {
+			ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
 		}
-		ip.SetID(uint16(id))
 	}
 
 	// Always set the checksum.
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 794ddb5c8..800bf3f08 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -27,6 +27,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "tuple_list",
+    out = "tuple_list.go",
+    package = "stack",
+    prefix = "tuple",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*tuple",
+        "Linker": "*tuple",
+    },
+)
+
 go_library(
     name = "stack",
     srcs = [
@@ -35,6 +47,7 @@ go_library(
         "forwarder.go",
         "icmp_rate_limit.go",
         "iptables.go",
+        "iptables_state.go",
         "iptables_targets.go",
         "iptables_types.go",
         "linkaddrcache.go",
@@ -50,6 +63,7 @@ go_library(
         "stack_global_state.go",
         "stack_options.go",
         "transport_demuxer.go",
+        "tuple_list.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -79,6 +93,7 @@ go_test(
         "transport_demuxer_test.go",
         "transport_test.go",
     ],
+    shard_count = 20,
     deps = [
         ":stack",
         "//pkg/rand",
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index af9c325ca..d39baf620 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -15,9 +15,12 @@
 package stack
 
 import (
+	"encoding/binary"
 	"sync"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
 )
@@ -30,6 +33,10 @@ import (
 //
 // Currently, only TCP tracking is supported.
 
+// Our hash table has 16K buckets.
+// TODO(gvisor.dev/issue/170): These should be tunable.
+const numBuckets = 1 << 14
+
 // Direction of the tuple.
 type direction int
 
@@ -48,7 +55,12 @@ const (
 
 // tuple holds a connection's identifying and manipulating data in one
 // direction. It is immutable.
+//
+// +stateify savable
 type tuple struct {
+	// tupleEntry is used to build an intrusive list of tuples.
+	tupleEntry
+
 	tupleID
 
 	// conn is the connection tracking entry this tuple belongs to.
@@ -61,6 +73,8 @@ type tuple struct {
 // tupleID uniquely identifies a connection in one direction. It currently
 // contains enough information to distinguish between any TCP or UDP
 // connection, and will need to be extended to support other protocols.
+//
+// +stateify savable
 type tupleID struct {
 	srcAddr    tcpip.Address
 	srcPort    uint16
@@ -83,6 +97,8 @@ func (ti tupleID) reply() tupleID {
 }
 
 // conn is a tracked connection.
+//
+// +stateify savable
 type conn struct {
 	// original is the tuple in original direction. It is immutable.
 	original tuple
@@ -98,22 +114,67 @@ type conn struct {
 	tcbHook Hook
 
 	// mu protects tcb.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// tcb is TCB control block. It is used to keep track of states
 	// of tcp connection and is protected by mu.
 	tcb tcpconntrack.TCB
+
+	// lastUsed is the last time the connection saw a relevant packet, and
+	// is updated by each packet on the connection. It is protected by mu.
+	lastUsed time.Time `state:".(unixTime)"`
+}
+
+// timedOut reports whether cn has been idle past the timeout for its state.
+func (cn *conn) timedOut(now time.Time) bool {
+	const establishedTimeout = 5 * 24 * time.Hour
+	const defaultTimeout = 120 * time.Second
+	cn.mu.Lock()
+	defer cn.mu.Unlock()
+	// Mirror the Linux defaults: established connections are kept for
+	// 5(!) days, while connections in most other states may remain for
+	// at most 120 seconds.
+	limit := defaultTimeout
+	if cn.tcb.State() == tcpconntrack.ResultAlive {
+		limit = establishedTimeout
+	}
+	return now.Sub(cn.lastUsed) > limit
+}
 
 // ConnTrack tracks all connections created for NAT rules. Most users are
 // expected to only call handlePacket and createConnFor.
+//
+// ConnTrack keeps all connections in a slice of buckets, each of which holds a
+// linked list of tuples. This gives us some desirable properties:
+// - Each bucket has its own lock, lessening lock contention.
+// - The slice is large enough that lists stay short (<10 elements on average).
+//   Thus traversal is fast.
+// - During linked list traversal we reap expired connections. This amortizes
+//   the cost of reaping them and makes reapUnused faster.
+//
+// Locks are ordered by their location in the buckets slice. That is, a
+// goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j.
+//
+// +stateify savable
 type ConnTrack struct {
-	// mu protects conns.
-	mu sync.RWMutex
+	// seed is a one-time random value initialized at stack startup
+	// and is used in the calculation of hash keys for the list of buckets.
+	// It is immutable.
+	seed uint32
 
-	// conns maintains a map of tuples needed for connection tracking for
-	// iptables NAT rules. It is protected by mu.
-	conns map[tupleID]tuple
+	// mu protects the buckets slice, but not buckets' contents. Only take
+	// the write lock if you are modifying the slice or saving for S/R.
+	mu sync.RWMutex `state:"nosave"`
+
+	// buckets is protected by mu.
+	buckets []bucket
+}
+
+// +stateify savable
+type bucket struct {
+	// mu protects tuples.
+	mu     sync.Mutex `state:"nosave"`
+	tuples tupleList
 }
 
 // packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid
@@ -143,8 +204,9 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) {
 // newConn creates new connection.
 func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
 	conn := conn{
-		manip:   manip,
-		tcbHook: hook,
+		manip:    manip,
+		tcbHook:  hook,
+		lastUsed: time.Now(),
 	}
 	conn.original = tuple{conn: &conn, tupleID: orig}
 	conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply}
@@ -162,14 +224,28 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) {
 		return nil, dirOriginal
 	}
 
-	ct.mu.Lock()
-	defer ct.mu.Unlock()
-
-	tuple, ok := ct.conns[tid]
-	if !ok {
-		return nil, dirOriginal
+	bucket := ct.bucket(tid)
+	now := time.Now()
+
+	ct.mu.RLock()
+	defer ct.mu.RUnlock()
+	ct.buckets[bucket].mu.Lock()
+	defer ct.buckets[bucket].mu.Unlock()
+
+	// Iterate over the tuples in a bucket, cleaning up any unused
+	// connections we find.
+	for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() {
+		// Clean up any timed-out connections we happen to find.
+		if ct.reapTupleLocked(other, bucket, now) {
+			// The tuple expired.
+			continue
+		}
+		if tid == other.tupleID {
+			return other.conn, other.direction
+		}
 	}
-	return tuple.conn, tuple.direction
+
+	return nil, dirOriginal
 }
 
 // createConnFor creates a new conn for pkt.
@@ -197,13 +273,31 @@ func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarg
 	}
 	conn := newConn(tid, replyTID, manip, hook)
 
-	// Add the changed tuple to the map.
-	// TODO(gvisor.dev/issue/170): Need to support collisions using linked
-	// list.
-	ct.mu.Lock()
-	defer ct.mu.Unlock()
-	ct.conns[tid] = conn.original
-	ct.conns[replyTID] = conn.reply
+	// Lock the buckets in the correct order.
+	tupleBucket := ct.bucket(tid)
+	replyBucket := ct.bucket(replyTID)
+	ct.mu.RLock()
+	defer ct.mu.RUnlock()
+	if tupleBucket < replyBucket {
+		ct.buckets[tupleBucket].mu.Lock()
+		ct.buckets[replyBucket].mu.Lock()
+	} else if tupleBucket > replyBucket {
+		ct.buckets[replyBucket].mu.Lock()
+		ct.buckets[tupleBucket].mu.Lock()
+	} else {
+		// Both tuples are in the same bucket.
+		ct.buckets[tupleBucket].mu.Lock()
+	}
+
+	// Add both tuples to their buckets.
+	ct.buckets[tupleBucket].tuples.PushFront(&conn.original)
+	ct.buckets[replyBucket].tuples.PushFront(&conn.reply)
+
+	// Unlocking can happen in any order.
+	ct.buckets[tupleBucket].mu.Unlock()
+	if tupleBucket != replyBucket {
+		ct.buckets[replyBucket].mu.Unlock()
+	}
 
 	return conn
 }
@@ -297,35 +391,134 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou
 	// other tcp states.
 	conn.mu.Lock()
 	defer conn.mu.Unlock()
-	var st tcpconntrack.Result
-	tcpHeader := header.TCP(pkt.TransportHeader)
-	if conn.tcb.IsEmpty() {
+
+	// Mark the connection as having been used recently so it isn't reaped.
+	conn.lastUsed = time.Now()
+	// Update connection state.
+	if tcpHeader := header.TCP(pkt.TransportHeader); conn.tcb.IsEmpty() {
 		conn.tcb.Init(tcpHeader)
 		conn.tcbHook = hook
+	} else if hook == conn.tcbHook {
+		conn.tcb.UpdateStateOutbound(tcpHeader)
 	} else {
-		switch hook {
-		case conn.tcbHook:
-			st = conn.tcb.UpdateStateOutbound(tcpHeader)
-		default:
-			st = conn.tcb.UpdateStateInbound(tcpHeader)
-		}
+		conn.tcb.UpdateStateInbound(tcpHeader)
 	}
+}
+
+// bucket returns the index into ct.buckets that id hashes to. The hash is
+// seeded with ct.seed. bucket takes ct.mu for reading itself, so callers that
+// already hold the read lock (e.g. reapTupleLocked) acquire it recursively.
+func (ct *ConnTrack) bucket(id tupleID) int {
+	h := jenkins.Sum32(ct.seed)
+	h.Write([]byte(id.srcAddr))
+	h.Write([]byte(id.dstAddr))
+	var shortBuf [2]byte
+	for _, v := range []uint16{id.srcPort, id.dstPort, uint16(id.transProto), uint16(id.netProto)} {
+		binary.LittleEndian.PutUint16(shortBuf[:], v)
+		h.Write(shortBuf[:])
+	}
+	ct.mu.RLock()
+	defer ct.mu.RUnlock()
+	// Reduce in uint32 space: where int is 32 bits, int(h.Sum32()) can be
+	// negative, and a negative remainder would index out of range.
+	return int(h.Sum32() % uint32(len(ct.buckets)))
+}
 
-	// Delete conn if tcp connection is closed.
-	if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset {
-		ct.deleteConn(conn)
+// reapUnused deletes timed out entries from the conntrack table. The rules for
+// reaping are:
+// - Most reaping occurs in connFor, which is called on each packet. connFor
+//   cleans up the bucket the packet's connection maps to. Thus calls to
+//   reapUnused should be fast.
+// - Each call to reapUnused traverses a fraction of the conntrack table.
+//   Specifically, it traverses len(ct.buckets)/fractionPerReaping buckets.
+// - After reaping, reapUnused decides when it should next run based on the
+//   ratio of expired connections to examined connections. If the ratio is
+//   greater than maxExpiredPct, it schedules the next run quickly. Otherwise it
+//   slightly increases the interval between runs.
+// - maxFullTraversal caps the time it takes to traverse the entire table.
+//
+// reapUnused returns the next bucket that should be checked and the time after
+// which it should be called again.
+func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) {
+	// TODO(gvisor.dev/issue/170): This can be more finely controlled, as
+	// it is in Linux via sysctl.
+	const fractionPerReaping = 128
+	const maxExpiredPct = 50
+	const maxFullTraversal = 60 * time.Second
+	const minInterval = 10 * time.Millisecond
+	const maxInterval = maxFullTraversal / fractionPerReaping
+
+	now := time.Now()
+	checked := 0
+	expired := 0
+	var idx int
+	ct.mu.RLock()
+	defer ct.mu.RUnlock()
+	for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ {
+		idx = (i + start) % len(ct.buckets)
+		ct.buckets[idx].mu.Lock()
+		for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() {
+			checked++
+			if ct.reapTupleLocked(tuple, idx, now) {
+				expired++
+			}
+		}
+		ct.buckets[idx].mu.Unlock()
+	}
+	// buckets[idx] was the last bucket checked; the next run resumes after it.
+	idx++
+
+	// If more than maxExpiredPct percent of the examined connections have
+	// expired, the table has gotten stale. Reschedule quickly.
+	expiredPct := 0
+	if checked != 0 {
+		expiredPct = expired * 100 / checked
+	}
+	if expiredPct > maxExpiredPct {
+		return idx, minInterval
+	}
+	if interval := prevInterval + minInterval; interval <= maxInterval {
+		// Increment the interval between runs.
+		return idx, interval
 	}
+	// We've hit the maximum interval.
+	return idx, maxInterval
 }
 
-// deleteConn deletes the connection.
-func (ct *ConnTrack) deleteConn(conn *conn) {
-	if conn == nil {
-		return
+// reapTupleLocked tries to remove tuple and its reply from the table. It
+// returns whether the tuple's connection has timed out.
+//
+// Preconditions: ct.mu is locked for reading and bucket is locked.
+func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool {
+	if !tuple.conn.timedOut(now) {
+		return false
 	}
 
-	ct.mu.Lock()
-	defer ct.mu.Unlock()
+	// To maintain lock order, we can only reap these tuples if the reply
+	// appears later in the table.
+	replyBucket := ct.bucket(tuple.reply())
+	if bucket > replyBucket {
+		return true
+	}
+
+	// Don't re-lock if both tuples are in the same bucket.
+	differentBuckets := bucket != replyBucket
+	if differentBuckets {
+		ct.buckets[replyBucket].mu.Lock()
+	}
+
+	// We have the buckets locked and can remove both tuples.
+	if tuple.direction == dirOriginal {
+		ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply)
+	} else {
+		ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original)
+	}
+	ct.buckets[bucket].tuples.Remove(tuple)
+
+	// Don't re-unlock if both tuples are in the same bucket.
+	if differentBuckets {
+		ct.buckets[replyBucket].mu.Unlock()
+	}
 
-	delete(ct.conns, conn.original.tupleID)
-	delete(ct.conns, conn.reply.tupleID)
+	return true
 }
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 974d77c36..f846ea2e5 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -16,6 +16,7 @@ package stack
 
 import (
 	"fmt"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -41,6 +42,9 @@ const (
 // underflow.
 const HookUnset = -1
 
+// reaperDelay is how long to wait before starting to reap connections.
+const reaperDelay = 5 * time.Second
+
 // DefaultTables returns a default set of tables. Each chain is set to accept
 // all packets.
 func DefaultTables() *IPTables {
@@ -112,8 +116,9 @@ func DefaultTables() *IPTables {
 			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
 		},
 		connections: ConnTrack{
-			conns: make(map[tupleID]tuple),
+			seed: generateRandUint32(),
 		},
+		reaperDone: make(chan struct{}, 1),
 	}
 }
 
@@ -169,6 +174,12 @@ func (it *IPTables) GetTable(name string) (Table, bool) {
 func (it *IPTables) ReplaceTable(name string, table Table) {
 	it.mu.Lock()
 	defer it.mu.Unlock()
+	// If iptables is being enabled, initialize the conntrack table and
+	// reaper.
+	if !it.modified {
+		it.connections.buckets = make([]bucket, numBuckets)
+		it.startReaper(reaperDelay)
+	}
 	it.modified = true
 	it.tables[name] = table
 }
@@ -249,6 +260,35 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
 	return true
 }
 
+// beforeSave is invoked by stateify. It leaves it.connections.mu write-locked.
+func (it *IPTables) beforeSave() {
+	// Ensure the reaper exits cleanly.
+	it.reaperDone <- struct{}{}
+	// Prevent others from modifying the connection table. NOTE(review): ConnTrack.beforeSave locks this mutex too; confirm both never run in one save.
+	it.connections.mu.Lock()
+}
+
+// afterLoad is invoked by stateify.
+func (it *IPTables) afterLoad() {
+	it.startReaper(reaperDelay)
+}
+
+// startReaper launches a goroutine that reaps timed out connections each
+// time interval elapses, adopting the interval that reapUnused returns.
+func (it *IPTables) startReaper(interval time.Duration) {
+	go func() { // S/R-SAFE: reaperDone is signalled when iptables is saved.
+		for next := 0; ; {
+			select {
+			case <-time.After(interval):
+				next, interval = it.connections.reapUnused(next, interval)
+			case <-it.reaperDone:
+				// Asked to stop; end the goroutine.
+				return
+			}
+		}
+	}()
+}
+
 // CheckPackets runs pkts through the rules for hook and returns a map of packets that
 // should not go forward.
 //
diff --git a/pkg/tcpip/stack/iptables_state.go b/pkg/tcpip/stack/iptables_state.go
new file mode 100644
index 000000000..529e02a07
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_state.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"time"
+)
+
+// +stateify savable
+type unixTime struct {
+	second int64
+	nano   int64
+}
+
+// saveLastUsed is invoked by stateify. It stores whole seconds plus only the sub-second nanoseconds, the pair time.Unix expects.
+func (cn *conn) saveLastUsed() unixTime {
+	return unixTime{cn.lastUsed.Unix(), int64(cn.lastUsed.Nanosecond())}
+}
+
+// loadLastUsed is invoked by stateify. It rebuilds lastUsed with time.Unix, which adds unix.nano nanoseconds to unix.second seconds.
+func (cn *conn) loadLastUsed(unix unixTime) {
+	cn.lastUsed = time.Unix(unix.second, unix.nano)
+}
+
+// beforeSave is invoked by stateify.
+func (ct *ConnTrack) beforeSave() {
+	ct.mu.Lock()
+}
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index c528ec381..eb70e3104 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -78,6 +78,8 @@ const (
 )
 
 // IPTables holds all the tables for a netstack.
+//
+// +stateify savable
 type IPTables struct {
 	// mu protects tables, priorities, and modified.
 	mu sync.RWMutex
@@ -97,10 +99,15 @@ type IPTables struct {
 	modified bool
 
 	connections ConnTrack
+
+	// reaperDone can be signalled to stop the reaper goroutine.
+	reaperDone chan struct{}
 }
 
 // A Table defines a set of chains and hooks into the network stack. It is
 // really just a list of rules.
+//
+// +stateify savable
 type Table struct {
 	// Rules holds the rules that make up the table.
 	Rules []Rule
@@ -130,6 +137,8 @@ func (table *Table) ValidHooks() uint32 {
 // contains zero or more matchers, each of which is a specification of which
 // packets this rule applies to. If there are no matchers in the rule, it
 // applies to any packet.
+//
+// +stateify savable
 type Rule struct {
 	// Filter holds basic IP filtering fields common to every rule.
 	Filter IPHeaderFilter
@@ -142,6 +151,8 @@ type Rule struct {
 }
 
 // IPHeaderFilter holds basic IP filtering data common to every rule.
+//
+// +stateify savable
 type IPHeaderFilter struct {
 	// Protocol matches the transport protocol.
 	Protocol tcpip.TransportProtocolNumber
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index afb7dfeaf..7b80534e6 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1358,16 +1358,19 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// TransportHeader is nil only when pkt is an ICMP packet or was reassembled
 	// from fragments.
 	if pkt.TransportHeader == nil {
-		// TODO(gvisor.dev/issue/170): ICMP packets don't have their
-		// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+		// TODO(gvisor.dev/issue/170): ICMP packets don't have their TransportHeader
+		// fields set yet, parse it here. See icmp/protocol.go:protocol.Parse for a
 		// full explanation.
 		if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber {
+			// ICMP packets may be longer, but until icmp.Parse is implemented, here
+			// we parse it using the minimum size.
 			transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
 			if !ok {
 				n.stack.stats.MalformedRcvdPackets.Increment()
 				return
 			}
 			pkt.TransportHeader = transHeader
+			pkt.Data.TrimFront(len(pkt.TransportHeader))
 		} else {
 			// This is either a bad packet or was re-assembled from fragments.
 			transProto.Parse(pkt)
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 1b5da6017..e3556d5d2 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -14,6 +14,7 @@
 package stack
 
 import (
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
@@ -24,7 +25,7 @@ import (
 // multiple endpoints. Clone() should be called in such cases so that
 // modifications to the Data field do not affect other copies.
 type PacketBuffer struct {
-	_ noCopy
+	_ sync.NoCopy
 
 	// PacketBufferEntry is used to build an intrusive list of
 	// PacketBuffers.
@@ -102,14 +103,3 @@ func (pk *PacketBuffer) Clone() *PacketBuffer {
 		NatDone:               pk.NatDone,
 	}
 }
-
-// noCopy may be embedded into structs which must not be copied
-// after the first use.
-//
-// See https://golang.org/issues/8005#issuecomment-190753527
-// for details.
-type noCopy struct{}
-
-// Lock is a no-op used by -copylocks checker from `go vet`.
-func (*noCopy) Lock()   {}
-func (*noCopy) Unlock() {}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index cdcfb8321..0aa815447 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -425,6 +425,7 @@ type Stack struct {
 	handleLocal bool
 
 	// tables are the iptables packet filtering and manipulation rules.
+	// TODO(gvisor.dev/issue/170): S/R this field.
 	tables *IPTables
 
 	// resumableEndpoints is a list of endpoints that need to be resumed if the
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 2be1c107a..71bcee785 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -648,6 +648,11 @@ const (
 	// whether an IPv6 socket is to be restricted to sending and receiving
 	// IPv6 packets only.
 	V6OnlyOption
+
+	// IPHdrIncludedOption is used by SetSockOpt to indicate for a raw
+	// endpoint that all packets being written have an IP header and the
+	// endpoint should not attach an IP header.
+	IPHdrIncludedOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -673,6 +678,13 @@ const (
 	// TCP_MAXSEG option.
 	MaxSegOption
 
+	// MTUDiscoverOption is used to set/get the path MTU discovery setting.
+	//
+	// NOTE: Setting this option to any other value than PMTUDiscoveryDont
+	// is not supported and will fail as such, and getting this option will
+	// always return PMTUDiscoveryDont.
+	MTUDiscoverOption
+
 	// MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control
 	// the default TTL value for multicast messages. The default is 1.
 	MulticastTTLOption
@@ -714,6 +726,24 @@ const (
 	TCPWindowClampOption
 )
 
+const (
+	// PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use
+	// per-route settings.
+	PMTUDiscoveryWant int = iota
+
+	// PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable
+	// path MTU discovery.
+	PMTUDiscoveryDont
+
+	// PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do
+	// path MTU discovery.
+	PMTUDiscoveryDo
+
+	// PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF
+	// but ignore path MTU.
+	PMTUDiscoveryProbe
+)
+
 // ErrorOption is used in GetSockOpt to specify that the last error reported by
 // the endpoint should be cleared and returned.
 type ErrorOption struct{}
@@ -752,7 +782,7 @@ type CongestionControlOption string
 // control algorithms.
 type AvailableCongestionControlOption string
 
-// buffer moderation.
+// ModerateReceiveBufferOption is used by buffer moderation.
 type ModerateReceiveBufferOption bool
 
 // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
@@ -825,7 +855,10 @@ type OutOfBandInlineOption int
 // a default TTL.
 type DefaultTTLOption uint8
 
-//
+// SocketDetachFilterOption is used by SetSockOpt to detach a previously attached
+// classic BPF filter on a given endpoint.
+type SocketDetachFilterOption int
+
 // IPPacketInfo is the message structure for IP_PKTINFO.
 //
 // +stateify savable
@@ -1214,6 +1247,9 @@ type UDPStats struct {
 
 	// ChecksumErrors is the number of datagrams dropped due to bad checksums.
 	ChecksumErrors *StatCounter
+
+	// InvalidSourceAddress is the number of datagrams dropped due to an invalid source address.
+	InvalidSourceAddress *StatCounter
 }
 
 // Stats holds statistics about the networking stack.
diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
index 59f3b391f..5554c573f 100644
--- a/pkg/tcpip/timer.go
+++ b/pkg/tcpip/timer.go
@@ -15,8 +15,9 @@
 package tcpip
 
 import (
-	"sync"
 	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // cancellableTimerInstance is a specific instance of CancellableTimer.
@@ -92,6 +93,8 @@ func (t *cancellableTimerInstance) stop() {
 // Note, it is not safe to copy a CancellableTimer as its timer instance creates
 // a closure over the address of the CancellableTimer.
 type CancellableTimer struct {
+	_ sync.NoCopy
+
 	// The active instance of a cancellable timer.
 	instance cancellableTimerInstance
 
@@ -157,22 +160,6 @@ func (t *CancellableTimer) Reset(d time.Duration) {
 	}
 }
 
-// Lock is a no-op used by the copylocks checker from go vet.
-//
-// See CancellableTimer for details about why it shouldn't be copied.
-//
-// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
-// details about the copylocks checker.
-func (*CancellableTimer) Lock() {}
-
-// Unlock is a no-op used by the copylocks checker from go vet.
-//
-// See CancellableTimer for details about why it shouldn't be copied.
-//
-// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
-// details about the copylocks checker.
-func (*CancellableTimer) Unlock() {}
-
 // NewCancellableTimer returns an unscheduled CancellableTimer with the given
 // locker and fn.
 //
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 8ce294002..678f4e016 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -344,6 +344,10 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 
 // SetSockOpt sets a socket option.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch opt.(type) {
+	case tcpip.SocketDetachFilterOption:
+		return nil
+	}
 	return nil
 }
 
@@ -744,15 +748,15 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
-		if !ok || header.ICMPv4(h).Type() != header.ICMPv4EchoReply {
+		h := header.ICMPv4(pkt.TransportHeader)
+		if len(h) < header.ICMPv4MinimumSize || h.Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
-		if !ok || header.ICMPv6(h).Type() != header.ICMPv6EchoReply {
+		h := header.ICMPv6(pkt.TransportHeader)
+		if len(h) < header.ICMPv6MinimumSize || h.Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
@@ -786,7 +790,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		},
 	}
 
-	packet.data = pkt.Data
+	// ICMP socket's data includes ICMP header.
+	packet.data = pkt.TransportHeader.ToVectorisedView()
+	packet.data.Append(pkt.Data)
 
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += packet.data.Size()
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index baf08eda6..57b7f5c19 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -25,6 +25,8 @@
 package packet
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -71,11 +73,12 @@ type endpoint struct {
 	rcvClosed     bool
 
 	// The following fields are protected by mu.
-	mu         sync.RWMutex `state:"nosave"`
-	sndBufSize int
-	closed     bool
-	stats      tcpip.TransportEndpointStats `state:"nosave"`
-	bound      bool
+	mu            sync.RWMutex `state:"nosave"`
+	sndBufSize    int
+	sndBufSizeMax int
+	closed        bool
+	stats         tcpip.TransportEndpointStats `state:"nosave"`
+	bound         bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -92,6 +95,17 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 		sndBufSize:    32 * 1024,
 	}
 
+	// Override with stack defaults.
+	var ss stack.SendBufferSizeOption
+	if err := s.Option(&ss); err == nil {
+		ep.sndBufSizeMax = ss.Default
+	}
+
+	var rs stack.ReceiveBufferSizeOption
+	if err := s.Option(&rs); err == nil {
+		ep.rcvBufSizeMax = rs.Default
+	}
+
 	if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil {
 		return nil, err
 	}
@@ -264,7 +278,13 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // used with SetSockOpt, and this function always returns
 // tcpip.ErrNotSupported.
 func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
+	switch opt.(type) {
+	case tcpip.SocketDetachFilterOption:
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
@@ -274,7 +294,46 @@ func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.SendBufferSizeOption:
+		// Make sure the send buffer size is within the min and max
+		// allowed.
+		var ss stack.SendBufferSizeOption
+		if err := ep.stack.Option(&ss); err != nil {
+			panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err))
+		}
+		if v > ss.Max {
+			v = ss.Max
+		}
+		if v < ss.Min {
+			v = ss.Min
+		}
+		ep.mu.Lock()
+		ep.sndBufSizeMax = v
+		ep.mu.Unlock()
+		return nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		// Make sure the receive buffer size is within the min and max
+		// allowed.
+		var rs stack.ReceiveBufferSizeOption
+		if err := ep.stack.Option(&rs); err != nil {
+			panic(fmt.Sprintf("s.Option(%#v) = %s", rs, err))
+		}
+		if v > rs.Max {
+			v = rs.Max
+		}
+		if v < rs.Min {
+			v = rs.Min
+		}
+		ep.rcvMu.Lock()
+		ep.rcvBufSizeMax = v
+		ep.rcvMu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
@@ -289,7 +348,32 @@ func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
-	return 0, tcpip.ErrNotSupported
+	switch opt {
+	case tcpip.ReceiveQueueSizeOption:
+		v := 0
+		ep.rcvMu.Lock()
+		if !ep.rcvList.Empty() {
+			p := ep.rcvList.Front()
+			v = p.data.Size()
+		}
+		ep.rcvMu.Unlock()
+		return v, nil
+
+	case tcpip.SendBufferSizeOption:
+		ep.mu.Lock()
+		v := ep.sndBufSizeMax
+		ep.mu.Unlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		ep.rcvMu.Lock()
+		v := ep.rcvBufSizeMax
+		ep.rcvMu.Unlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 766c7648e..c2e9fd29f 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -63,6 +63,7 @@ type endpoint struct {
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
 	associated  bool
+	hdrIncluded bool
 
 	// The following fields are used to manage the receive queue and are
 	// protected by rcvMu.
@@ -108,6 +109,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSizeMax: 32 * 1024,
 		associated:    associated,
+		hdrIncluded:   !associated,
 	}
 
 	// Override with stack defaults.
@@ -182,10 +184,6 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 
 // Read implements tcpip.Endpoint.Read.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
-	if !e.associated {
-		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidOptionValue
-	}
-
 	e.rcvMu.Lock()
 
 	// If there's no data to read, return that read would block or that the
@@ -263,7 +261,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 	// If this is an unassociated socket and callee provided a nonzero
 	// destination address, route using that address.
-	if !e.associated {
+	if e.hdrIncluded {
 		ip := header.IPv4(payloadBytes)
 		if !ip.IsValid(len(payloadBytes)) {
 			e.mu.RUnlock()
@@ -353,7 +351,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 		}
 	}
 
-	if !e.associated {
+	if e.hdrIncluded {
 		if err := route.WriteHeaderIncludedPacket(&stack.PacketBuffer{
 			Data: buffer.View(payloadBytes).ToVectorisedView(),
 		}); err != nil {
@@ -508,11 +506,24 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
+	switch opt.(type) {
+	case tcpip.SocketDetachFilterOption:
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
+	case tcpip.IPHdrIncludedOption:
+		e.mu.Lock()
+		e.hdrIncluded = v
+		e.mu.Unlock()
+		return nil
+	}
 	return tcpip.ErrUnknownProtocolOption
 }
 
@@ -577,6 +588,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	case tcpip.KeepaliveEnabledOption:
 		return false, nil
 
+	case tcpip.IPHdrIncludedOption:
+		e.mu.Lock()
+		v := e.hdrIncluded
+		e.mu.Unlock()
+		return v, nil
+
 	default:
 		return false, tcpip.ErrUnknownProtocolOption
 	}
@@ -616,8 +633,15 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) {
 	e.rcvMu.Lock()
 
-	// Drop the packet if our buffer is currently full.
-	if e.rcvClosed {
+	// Drop the packet if our buffer is currently full or if this is an unassociated
+	// endpoint (i.e. an endpoint created w/ IPPROTO_RAW). Such endpoints are send-only.
+	// See: https://man7.org/linux/man-pages/man7/raw.7.html
+	//
+	//    An IPPROTO_RAW socket is send only.  If you really want to receive
+	//    all IP packets, use a packet(7) socket with the ETH_P_IP protocol.
+	//    Note that packet sockets don't reassemble IP fragments, unlike raw
+	//    sockets.
+	if e.rcvClosed || !e.associated {
 		e.rcvMu.Unlock()
 		e.stack.Stats().DroppedPackets.Increment()
 		e.stats.ReceiveErrors.ClosedReceiver.Increment()
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 6baeda8e4..18ff89ffc 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -86,6 +86,7 @@ go_test(
         "tcp_test.go",
         "tcp_timestamp_test.go",
     ],
+    shard_count = 10,
     deps = [
         ":tcp",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index 047704c80..98aecab9e 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -15,6 +15,8 @@
 package tcp
 
 import (
+	"encoding/binary"
+
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -66,89 +68,68 @@ func (q *epQueue) empty() bool {
 // processor is responsible for processing packets queued to a tcp endpoint.
 type processor struct {
 	epQ              epQueue
+	sleeper          sleep.Sleeper
 	newEndpointWaker sleep.Waker
 	closeWaker       sleep.Waker
-	id               int
-	wg               sync.WaitGroup
-}
-
-func newProcessor(id int) *processor {
-	p := &processor{
-		id: id,
-	}
-	p.wg.Add(1)
-	go p.handleSegments()
-	return p
 }
 
 func (p *processor) close() {
 	p.closeWaker.Assert()
 }
 
-func (p *processor) wait() {
-	p.wg.Wait()
-}
-
 func (p *processor) queueEndpoint(ep *endpoint) {
 	// Queue an endpoint for processing by the processor goroutine.
 	p.epQ.enqueue(ep)
 	p.newEndpointWaker.Assert()
 }
 
-func (p *processor) handleSegments() {
-	const newEndpointWaker = 1
-	const closeWaker = 2
-	s := sleep.Sleeper{}
-	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
-	s.AddWaker(&p.closeWaker, closeWaker)
-	defer s.Done()
+const (
+	newEndpointWaker = 1
+	closeWaker       = 2
+)
+
+func (p *processor) start(wg *sync.WaitGroup) {
+	defer wg.Done()
+	defer p.sleeper.Done()
+
 	for {
-		id, ok := s.Fetch(true)
-		if ok && id == closeWaker {
-			p.wg.Done()
-			return
+		if id, _ := p.sleeper.Fetch(true); id == closeWaker {
+			break
 		}
-		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
+		for {
+			ep := p.epQ.dequeue()
+			if ep == nil {
+				break
+			}
 			if ep.segmentQueue.empty() {
 				continue
 			}
 
-			// If socket has transitioned out of connected state
-			// then just let the worker handle the packet.
+			// If socket has transitioned out of connected state then just let the
+			// worker handle the packet.
 			//
-			// NOTE: We read this outside of e.mu lock which means
-			// that by the time we get to handleSegments the
-			// endpoint may not be in ESTABLISHED. But this should
-			// be fine as all normal shutdown states are handled by
-			// handleSegments and if the endpoint moves to a
-			// CLOSED/ERROR state then handleSegments is a noop.
-			if ep.EndpointState() != StateEstablished {
-				ep.newSegmentWaker.Assert()
-				continue
-			}
-
-			if !ep.mu.TryLock() {
-				ep.newSegmentWaker.Assert()
-				continue
-			}
-			// If the endpoint is in a connected state then we do
-			// direct delivery to ensure low latency and avoid
-			// scheduler interactions.
-			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
-				// Send any active resets if required.
-				if err != nil {
+			// NOTE: We read this outside of e.mu lock which means that by the time
+			// we get to handleSegments the endpoint may not be in ESTABLISHED. But
+			// this should be fine as all normal shutdown states are handled by
+			// handleSegments and if the endpoint moves to a CLOSED/ERROR state
+			// then handleSegments is a noop.
+			if ep.EndpointState() == StateEstablished && ep.mu.TryLock() {
+				// If the endpoint is in a connected state then we do direct delivery
+				// to ensure low latency and avoid scheduler interactions.
+				switch err := ep.handleSegments(true /* fastPath */); {
+				case err != nil:
+					// Send any active resets if required.
 					ep.resetConnectionLocked(err)
+					fallthrough
+				case ep.EndpointState() == StateClose:
+					ep.notifyProtocolGoroutine(notifyTickleWorker)
+				case !ep.segmentQueue.empty():
+					p.epQ.enqueue(ep)
 				}
-				ep.notifyProtocolGoroutine(notifyTickleWorker)
 				ep.mu.Unlock()
-				continue
-			}
-
-			if !ep.segmentQueue.empty() {
-				p.epQ.enqueue(ep)
+			} else {
+				ep.newSegmentWaker.Assert()
 			}
-
-			ep.mu.Unlock()
 		}
 	}
 }
@@ -159,31 +140,36 @@ func (p *processor) handleSegments() {
 // hash of the endpoint id to ensure that delivery for the same endpoint happens
 // in-order.
 type dispatcher struct {
-	processors []*processor
+	processors []processor
 	seed       uint32
-}
-
-func newDispatcher(nProcessors int) *dispatcher {
-	processors := []*processor{}
-	for i := 0; i < nProcessors; i++ {
-		processors = append(processors, newProcessor(i))
-	}
-	return &dispatcher{
-		processors: processors,
-		seed:       generateRandUint32(),
+	wg         sync.WaitGroup
+}
+
+func (d *dispatcher) init(nProcessors int) {
+	d.close()
+	d.wait()
+	d.processors = make([]processor, nProcessors)
+	d.seed = generateRandUint32()
+	for i := range d.processors {
+		p := &d.processors[i]
+		p.sleeper.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+		p.sleeper.AddWaker(&p.closeWaker, closeWaker)
+		d.wg.Add(1)
+		// NB: sleeper-waker registration must happen synchronously to avoid races
+		// with `close`.  It's possible to pull all this logic into `start`, but
+		// that results in a heap-allocated function literal.
+		go p.start(&d.wg)
 	}
 }
 
 func (d *dispatcher) close() {
-	for _, p := range d.processors {
-		p.close()
+	for i := range d.processors {
+		d.processors[i].close()
 	}
 }
 
 func (d *dispatcher) wait() {
-	for _, p := range d.processors {
-		p.wait()
-	}
+	d.wg.Wait()
 }
 
 func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
@@ -231,20 +217,18 @@ func generateRandUint32() uint32 {
 	if _, err := rand.Read(b); err != nil {
 		panic(err)
 	}
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b)
 }
 
 func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
-	payload := []byte{
-		byte(id.LocalPort),
-		byte(id.LocalPort >> 8),
-		byte(id.RemotePort),
-		byte(id.RemotePort >> 8)}
+	var payload [4]byte
+	binary.LittleEndian.PutUint16(payload[0:], id.LocalPort)
+	binary.LittleEndian.PutUint16(payload[2:], id.RemotePort)
 
 	h := jenkins.Sum32(d.seed)
-	h.Write(payload)
+	h.Write(payload[:])
 	h.Write([]byte(id.LocalAddress))
 	h.Write([]byte(id.RemoteAddress))
 
-	return d.processors[h.Sum32()%uint32(len(d.processors))]
+	return &d.processors[h.Sum32()%uint32(len(d.processors))]
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index bd3ec5a8d..83dc10ed0 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1589,6 +1589,13 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		e.UnlockUser()
 		e.notifyProtocolGoroutine(notifyMSSChanged)
 
+	case tcpip.MTUDiscoverOption:
+		// Return not supported if attempting to set this option to
+		// anything other than path MTU discovery disabled.
+		if v != tcpip.PMTUDiscoveryDont {
+			return tcpip.ErrNotSupported
+		}
+
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
@@ -1785,6 +1792,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.deferAccept = time.Duration(v)
 		e.UnlockUser()
 
+	case tcpip.SocketDetachFilterOption:
+		return nil
+
 	default:
 		return nil
 	}
@@ -1896,6 +1906,11 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		v := header.TCPDefaultMSS
 		return v, nil
 
+	case tcpip.MTUDiscoverOption:
+		// Always return the path MTU discovery disabled setting since
+		// it's the only one supported.
+		return tcpip.PMTUDiscoveryDont, nil
+
 	case tcpip.ReceiveQueueSizeOption:
 		return e.readyReceiveSize()
 
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index f2ae6ce50..b34e47bbd 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -174,7 +174,7 @@ type protocol struct {
 	maxRetries                 uint32
 	synRcvdCount               synRcvdCounter
 	synRetries                 uint8
-	dispatcher                 *dispatcher
+	dispatcher                 dispatcher
 }
 
 // Number returns the tcp protocol number.
@@ -515,7 +515,7 @@ func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
 
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
-	return &protocol{
+	p := protocol{
 		sendBufferSize: SendBufferSizeOption{
 			Min:     MinBufferSize,
 			Default: DefaultSendBufferSize,
@@ -531,10 +531,11 @@ func NewProtocol() stack.TransportProtocol {
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
 		synRcvdCount:               synRcvdCounter{threshold: SynRcvdCountThreshold},
-		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
 		synRetries:                 DefaultSynRetries,
 		minRTO:                     MinRTO,
 		maxRTO:                     MaxRTO,
 		maxRetries:                 MaxRetries,
 	}
+	p.dispatcher.init(runtime.GOMAXPROCS(0))
+	return &p
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 169adb16b..e67ec42b1 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -3095,6 +3095,63 @@ func TestMaxRTO(t *testing.T) {
 	}
 }
 
+// TestRetransmitIPv4IDUniqueness tests that the IPv4 Identification field is
+// unique on retransmits.
+func TestRetransmitIPv4IDUniqueness(t *testing.T) {
+	for _, tc := range []struct {
+		name string
+		size int
+	}{
+		{"1Byte", 1},
+		{"512Bytes", 512},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
+
+			// Disabling PMTU discovery causes all packets sent from this socket to
+			// have DF=0. This needs to be done because the IPv4 ID uniqueness
+			// applies only to non-atomic IPv4 datagrams as defined in RFC 6864
+			// Section 4, and datagrams with DF=0 are non-atomic.
+			if err := c.EP.SetSockOptInt(tcpip.MTUDiscoverOption, tcpip.PMTUDiscoveryDont); err != nil {
+				t.Fatalf("disabling PMTU discovery via sockopt to force DF=0 failed: %s", err)
+			}
+
+			if _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(tc.size)), tcpip.WriteOptions{}); err != nil {
+				t.Fatalf("Write failed: %s", err)
+			}
+			pkt := c.GetPacket()
+			checker.IPv4(t, pkt,
+				checker.FragmentFlags(0),
+				checker.TCP(
+					checker.DstPort(context.TestPort),
+					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+				),
+			)
+			idSet := map[uint16]struct{}{header.IPv4(pkt).ID(): struct{}{}}
+			// Read three retransmitted packets and check that all packets received
+			// have unique IPv4 ID values.
+			for i := 0; i <= 2; i++ {
+				pkt := c.GetPacket()
+				checker.IPv4(t, pkt,
+					checker.FragmentFlags(0),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+				id := header.IPv4(pkt).ID()
+				if _, exists := idSet[id]; exists {
+					t.Fatalf("duplicate IPv4 ID=%d found in retransmitted packet", id)
+				}
+				idSet[id] = struct{}{}
+			}
+		})
+	}
+}
+
 func TestFinImmediately(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
index 12bc1b5b5..558b06df0 100644
--- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
+++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
@@ -106,6 +106,11 @@ func (t *TCB) UpdateStateOutbound(tcp header.TCP) Result {
 	return st
 }
 
+// State returns the current state of the TCB.
+func (t *TCB) State() Result {
+	return t.state
+}
+
 // IsAlive returns true as long as the connection is established(Alive)
 // or connecting state.
 func (t *TCB) IsAlive() bool {
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index cae29fbff..a14643ae8 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -612,6 +612,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
+	case tcpip.MTUDiscoverOption:
+		// Return not supported if the value is not disabling path
+		// MTU discovery.
+		if v != tcpip.PMTUDiscoveryDont {
+			return tcpip.ErrNotSupported
+		}
+
 	case tcpip.MulticastTTLOption:
 		e.mu.Lock()
 		e.multicastTTL = uint8(v)
@@ -809,6 +816,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Lock()
 		e.bindToDevice = id
 		e.mu.Unlock()
+
+	case tcpip.SocketDetachFilterOption:
+		return nil
 	}
 	return nil
 }
@@ -906,6 +916,10 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.MTUDiscoverOption:
+		// The only supported setting is path MTU discovery disabled.
+		return tcpip.PMTUDiscoveryDont, nil
+
 	case tcpip.MulticastTTLOption:
 		e.mu.Lock()
 		v := int(e.multicastTTL)
@@ -1366,6 +1380,15 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		return
 	}
 
+	// Never receive from a multicast address.
+	if header.IsV4MulticastAddress(id.RemoteAddress) ||
+		header.IsV6MulticastAddress(id.RemoteAddress) {
+		e.stack.Stats().UDP.InvalidSourceAddress.Increment()
+		e.stack.Stats().IP.InvalidSourceAddressesReceived.Increment()
+		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+		return
+	}
+
 	// Verify checksum unless RX checksum offload is enabled.
 	// On IPv4, UDP checksum is optional, and a zero value means
 	// the transmitter omitted the checksum generation (RFC768).
@@ -1384,10 +1407,10 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		}
 	}
 
-	e.rcvMu.Lock()
 	e.stack.Stats().UDP.PacketsReceived.Increment()
 	e.stats.PacketsReceived.Increment()
 
+	e.rcvMu.Lock()
 	// Drop the packet if our buffer is currently full.
 	if !e.rcvReady || e.rcvClosed {
 		e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index db59eb5a0..90781cf49 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -83,16 +83,18 @@ type header4Tuple struct {
 type testFlow int
 
 const (
-	unicastV4       testFlow = iota // V4 unicast on a V4 socket
-	unicastV4in6                    // V4-mapped unicast on a V6-dual socket
-	unicastV6                       // V6 unicast on a V6 socket
-	unicastV6Only                   // V6 unicast on a V6-only socket
-	multicastV4                     // V4 multicast on a V4 socket
-	multicastV4in6                  // V4-mapped multicast on a V6-dual socket
-	multicastV6                     // V6 multicast on a V6 socket
-	multicastV6Only                 // V6 multicast on a V6-only socket
-	broadcast                       // V4 broadcast on a V4 socket
-	broadcastIn6                    // V4-mapped broadcast on a V6-dual socket
+	unicastV4         testFlow = iota // V4 unicast on a V4 socket
+	unicastV4in6                      // V4-mapped unicast on a V6-dual socket
+	unicastV6                         // V6 unicast on a V6 socket
+	unicastV6Only                     // V6 unicast on a V6-only socket
+	multicastV4                       // V4 multicast on a V4 socket
+	multicastV4in6                    // V4-mapped multicast on a V6-dual socket
+	multicastV6                       // V6 multicast on a V6 socket
+	multicastV6Only                   // V6 multicast on a V6-only socket
+	broadcast                         // V4 broadcast on a V4 socket
+	broadcastIn6                      // V4-mapped broadcast on a V6-dual socket
+	reverseMulticast4                 // V4 multicast src. Must fail.
+	reverseMulticast6                 // V6 multicast src. Must fail.
 )
 
 func (flow testFlow) String() string {
@@ -117,6 +119,10 @@ func (flow testFlow) String() string {
 		return "broadcast"
 	case broadcastIn6:
 		return "broadcastIn6"
+	case reverseMulticast4:
+		return "reverseMulticast4"
+	case reverseMulticast6:
+		return "reverseMulticast6"
 	default:
 		return "unknown"
 	}
@@ -168,6 +174,9 @@ func (flow testFlow) header4Tuple(d packetDirection) header4Tuple {
 			h.dstAddr.Addr = multicastV6Addr
 		}
 	}
+	if flow.isReverseMulticast() {
+		h.srcAddr.Addr = flow.getMcastAddr()
+	}
 	return h
 }
 
@@ -199,9 +208,9 @@ func (flow testFlow) netProto() tcpip.NetworkProtocolNumber {
 // endpoint for this flow.
 func (flow testFlow) sockProto() tcpip.NetworkProtocolNumber {
 	switch flow {
-	case unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, multicastV6Only, broadcastIn6:
+	case unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, multicastV6Only, broadcastIn6, reverseMulticast6:
 		return ipv6.ProtocolNumber
-	case unicastV4, multicastV4, broadcast:
+	case unicastV4, multicastV4, broadcast, reverseMulticast4:
 		return ipv4.ProtocolNumber
 	default:
 		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
@@ -224,7 +233,7 @@ func (flow testFlow) isV6Only() bool {
 	switch flow {
 	case unicastV6Only, multicastV6Only:
 		return true
-	case unicastV4, unicastV4in6, unicastV6, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6:
+	case unicastV4, unicastV4in6, unicastV6, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6, reverseMulticast4, reverseMulticast6:
 		return false
 	default:
 		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
@@ -235,7 +244,7 @@ func (flow testFlow) isMulticast() bool {
 	switch flow {
 	case multicastV4, multicastV4in6, multicastV6, multicastV6Only:
 		return true
-	case unicastV4, unicastV4in6, unicastV6, unicastV6Only, broadcast, broadcastIn6:
+	case unicastV4, unicastV4in6, unicastV6, unicastV6Only, broadcast, broadcastIn6, reverseMulticast4, reverseMulticast6:
 		return false
 	default:
 		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
@@ -246,7 +255,7 @@ func (flow testFlow) isBroadcast() bool {
 	switch flow {
 	case broadcast, broadcastIn6:
 		return true
-	case unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, multicastV6Only:
+	case unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, multicastV6Only, reverseMulticast4, reverseMulticast6:
 		return false
 	default:
 		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
@@ -257,13 +266,22 @@ func (flow testFlow) isMapped() bool {
 	switch flow {
 	case unicastV4in6, multicastV4in6, broadcastIn6:
 		return true
-	case unicastV4, unicastV6, unicastV6Only, multicastV4, multicastV6, multicastV6Only, broadcast:
+	case unicastV4, unicastV6, unicastV6Only, multicastV4, multicastV6, multicastV6Only, broadcast, reverseMulticast4, reverseMulticast6:
 		return false
 	default:
 		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
 	}
 }
 
+func (flow testFlow) isReverseMulticast() bool {
+	switch flow {
+	case reverseMulticast4, reverseMulticast6:
+		return true
+	default:
+		return false
+	}
+}
+
 type testContext struct {
 	t      *testing.T
 	linkEP *channel.Endpoint
@@ -872,6 +890,60 @@ func TestV4ReadOnBoundToBroadcast(t *testing.T) {
 	}
 }
 
+// TestReadFromMulticast checks that an endpoint will NOT receive a packet
+// that was sent with a multicast SOURCE address.
+func TestReadFromMulticast(t *testing.T) {
+	for _, flow := range []testFlow{reverseMulticast4, reverseMulticast6} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+			testFailingRead(c, flow, false /* expectReadError */)
+		})
+	}
+}
+
+// TestReadFromMulticastStats checks that a discarded packet
+// that was sent with a multicast SOURCE address increments
+// the correct counters and that a regular packet does not.
+func TestReadFromMulticastStats(t *testing.T) {
+	t.Helper()
+	for _, flow := range []testFlow{reverseMulticast4, reverseMulticast6, unicastV4} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			payload := newPayload()
+			c.injectPacket(flow, payload)
+
+			var want uint64 = 0
+			if flow.isReverseMulticast() {
+				want = 1
+			}
+			if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != want {
+				t.Errorf("got stats.IP.InvalidSourceAddressesReceived.Value() = %d, want = %d", got, want)
+			}
+			if got := c.s.Stats().UDP.InvalidSourceAddress.Value(); got != want {
+				t.Errorf("got stats.UDP.InvalidSourceAddress.Value() = %d, want = %d", got, want)
+			}
+			if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want {
+				t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want)
+			}
+		})
+	}
+}
+
 // TestV4ReadBroadcastOnBoundToWildcard checks that an endpoint can bind to ANY
 // and receive broadcast and unicast data.
 func TestV4ReadBroadcastOnBoundToWildcard(t *testing.T) {
@@ -1721,9 +1793,11 @@ func TestIncrementMalformedPacketsReceived(t *testing.T) {
 	payload := newPayload()
 	h := unicastV6.header4Tuple(incoming)
 	buf := c.buildV6Packet(payload, &h)
-	// Invalidate the packet length field in the UDP header by adding one.
+
+	// Invalidate the UDP header length field.
 	u := header.UDP(buf[header.IPv6MinimumSize:])
 	u.SetLength(u.Length() + 1)
+
 	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
@@ -1803,9 +1877,16 @@ func TestIncrementChecksumErrorsV4(t *testing.T) {
 	payload := newPayload()
 	h := unicastV4.header4Tuple(incoming)
 	buf := c.buildV4Packet(payload, &h)
-	// Invalidate the checksum field in the UDP header by adding one.
-	u := header.UDP(buf[header.IPv4MinimumSize:])
-	u.SetChecksum(u.Checksum() + 1)
+
+	// Invalidate the UDP header checksum field, taking care to avoid
+	// overflow to zero, which would disable checksum validation.
+	for u := header.UDP(buf[header.IPv4MinimumSize:]); ; {
+		u.SetChecksum(u.Checksum() + 1)
+		if u.Checksum() != 0 {
+			break
+		}
+	}
+
 	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
@@ -1834,9 +1915,11 @@ func TestIncrementChecksumErrorsV6(t *testing.T) {
 	payload := newPayload()
 	h := unicastV6.header4Tuple(incoming)
 	buf := c.buildV6Packet(payload, &h)
-	// Invalidate the checksum field in the UDP header by adding one.
+
+	// Invalidate the UDP header checksum field.
 	u := header.UDP(buf[header.IPv6MinimumSize:])
 	u.SetChecksum(u.Checksum() + 1)
+
 	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
diff --git a/pkg/test/dockerutil/BUILD b/pkg/test/dockerutil/BUILD
index 7c8758e35..83b80c8bc 100644
--- a/pkg/test/dockerutil/BUILD
+++ b/pkg/test/dockerutil/BUILD
@@ -5,10 +5,21 @@ package(licenses = ["notice"])
 go_library(
     name = "dockerutil",
     testonly = 1,
-    srcs = ["dockerutil.go"],
+    srcs = [
+        "container.go",
+        "dockerutil.go",
+        "exec.go",
+        "network.go",
+    ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/test/testutil",
-        "@com_github_kr_pty//:go_default_library",
+        "@com_github_docker_docker//api/types:go_default_library",
+        "@com_github_docker_docker//api/types/container:go_default_library",
+        "@com_github_docker_docker//api/types/mount:go_default_library",
+        "@com_github_docker_docker//api/types/network:go_default_library",
+        "@com_github_docker_docker//client:go_default_library",
+        "@com_github_docker_docker//pkg/stdcopy:go_default_library",
+        "@com_github_docker_go_connections//nat:go_default_library",
     ],
 )
diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go
new file mode 100644
index 000000000..17acdaf6f
--- /dev/null
+++ b/pkg/test/dockerutil/container.go
@@ -0,0 +1,501 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dockerutil
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io/ioutil"
+	"net"
+	"os"
+	"path"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/mount"
+	"github.com/docker/docker/api/types/network"
+	"github.com/docker/docker/client"
+	"github.com/docker/docker/pkg/stdcopy"
+	"github.com/docker/go-connections/nat"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+// Container represents a Docker Container allowing
+// user to configure and control as one would with the 'docker'
+// client. Container is backed by the official golang docker API.
+// See: https://pkg.go.dev/github.com/docker/docker.
+type Container struct {
+	Name    string
+	Runtime string
+
+	logger   testutil.Logger
+	client   *client.Client
+	id       string
+	mounts   []mount.Mount
+	links    []string
+	cleanups []func()
+	copyErr  error
+
+	// streams is the connection attached in Start. Used by WaitForOutputSubmatch.
+	streams types.HijackedResponse
+
+	// streamBuf stores previously read data from the attached streams.
+	streamBuf bytes.Buffer
+}
+
+// RunOpts are options for running a container.
+type RunOpts struct {
+	// Image is the image relative to images/. This will be mangled
+	// appropriately, to ensure that only first-party images are used.
+	Image string
+
+	// Memory is the memory limit in bytes.
+	Memory int
+
+	// CpusetCpus are the CPUs in which to allow execution (e.g. "0", "1", "0-2").
+	CpusetCpus string
+
+	// Ports are the ports to be allocated.
+	Ports []int
+
+	// WorkDir sets the working directory.
+	WorkDir string
+
+	// ReadOnly sets the read-only flag.
+	ReadOnly bool
+
+	// Env are additional environment variables.
+	Env []string
+
+	// User is the user to use.
+	User string
+
+	// Privileged enables privileged mode.
+	Privileged bool
+
+	// CapAdd are the extra set of capabilities to add.
+	CapAdd []string
+
+	// CapDrop are the extra set of capabilities to drop.
+	CapDrop []string
+
+	// Mounts is the list of directories/files to be mounted inside the container.
+	Mounts []mount.Mount
+
+	// Links is the list of container links (see MakeLink) to connect to the container.
+	Links []string
+}
+
+// MakeContainer sets up the struct for a Docker container. Names of containers
+// will be unique. It returns nil, after logging the cause, if the Docker client
+// cannot be created.
+func MakeContainer(ctx context.Context, logger testutil.Logger) *Container {
+	// Slashes are not allowed in container names.
+	name := strings.ReplaceAll(testutil.RandomID(logger.Name()), "/", "-")
+	cli, err := client.NewClientWithOpts(client.FromEnv)
+	if err != nil {
+		logger.Logf("failed to create docker client: %v", err)
+		return nil
+	}
+
+	cli.NegotiateAPIVersion(ctx)
+
+	return &Container{
+		logger:  logger,
+		Name:    name,
+		Runtime: *runtime,
+		client:  cli,
+	}
+}
+
+// Spawn creates and starts the container without waiting; like 'docker run -d'.
+func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error {
+	if err := c.create(ctx, r, args); err != nil {
+		return err
+	}
+	return c.Start(ctx)
+}
+
+// SpawnProcess is analogous to 'docker run -it': it creates the container with
+// Tty and OpenStdin set, starts it and returns a Process bound to its streams.
+func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) (Process, error) {
+	config, hostconf, netconf := c.ConfigsFrom(r, args...)
+	config.Tty = true
+	config.OpenStdin = true
+
+	if err := c.CreateFrom(ctx, config, hostconf, netconf); err != nil {
+		return Process{}, err
+	}
+
+	if err := c.Start(ctx); err != nil {
+		return Process{}, err
+	}
+
+	return Process{container: c, conn: c.streams}, nil
+}
+
+// Run is analogous to 'docker run': create, start, wait for exit, return the logs.
+func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string, error) {
+	if err := c.create(ctx, r, args); err != nil {
+		return "", err
+	}
+
+	if err := c.Start(ctx); err != nil {
+		return "", err
+	}
+
+	if err := c.Wait(ctx); err != nil {
+		return "", err
+	}
+
+	return c.Logs(ctx)
+}
+
+// ConfigsFrom returns container configs built from RunOpts and args, plus an
+// empty networking config. The caller should then call CreateFrom and Start.
+func (c *Container) ConfigsFrom(r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig) {
+	return c.config(r, args), c.hostConfig(r), &network.NetworkingConfig{}
+}
+
+// MakeLink formats a "<container name>:<target>" link to add to RunOpts.Links.
+func (c *Container) MakeLink(target string) string {
+	return fmt.Sprintf("%s:%s", c.Name, target)
+}
+
+// CreateFrom creates a container named c.Name from the given configs and records its ID.
+func (c *Container) CreateFrom(ctx context.Context, conf *container.Config, hostconf *container.HostConfig, netconf *network.NetworkingConfig) error {
+	cont, err := c.client.ContainerCreate(ctx, conf, hostconf, netconf, c.Name)
+	if err != nil {
+		return err
+	}
+	c.id = cont.ID
+	return nil
+}
+
+// Create is analogous to 'docker create'; it does not start the container.
+func (c *Container) Create(ctx context.Context, r RunOpts, args ...string) error {
+	return c.create(ctx, r, args)
+}
+
+// create is the shared implementation behind Create, Spawn and Run. It first
+// reports any error recorded by CopyFiles, so a failed copy is not silently
+// ignored. Callers that use CreateFrom directly bypass this check.
+func (c *Container) create(ctx context.Context, r RunOpts, args []string) error {
+	if c.copyErr != nil {
+		return c.copyErr
+	}
+	// A nil networking config is what create has always passed to Docker.
+	return c.CreateFrom(ctx, c.config(r, args), c.hostConfig(r), nil)
+}
+
+// config translates r and args into a container.Config, adding RUNSC_TEST_NAME.
+func (c *Container) config(r RunOpts, args []string) *container.Config {
+	ports := nat.PortSet{}
+	for _, p := range r.Ports {
+		ports[nat.Port(fmt.Sprintf("%d", p))] = struct{}{}
+	}
+	// Copy before appending so the caller's r.Env backing array is never written.
+	env := append(append([]string(nil), r.Env...), fmt.Sprintf("RUNSC_TEST_NAME=%s", c.Name))
+	return &container.Config{
+		Image:        testutil.ImageByName(r.Image),
+		Cmd:          args,
+		ExposedPorts: ports,
+		Env:          env,
+		WorkingDir:   r.WorkDir,
+		User:         r.User,
+	}
+}
+
+// hostConfig translates r into a container.HostConfig. As a side effect,
+// r.Mounts are appended to c.mounts; all accumulated mounts are passed on.
+func (c *Container) hostConfig(r RunOpts) *container.HostConfig {
+	c.mounts = append(c.mounts, r.Mounts...)
+	// The memory limit is expressed in bytes.
+	limits := container.Resources{Memory: int64(r.Memory), CpusetCpus: r.CpusetCpus}
+	return &container.HostConfig{
+		Runtime:         c.Runtime,
+		Mounts:          c.mounts,
+		PublishAllPorts: true,
+		Links:           r.Links,
+		CapAdd:          r.CapAdd,
+		CapDrop:         r.CapDrop,
+		Privileged:      r.Privileged,
+		ReadonlyRootfs:  r.ReadOnly,
+		Resources:       limits,
+	}
+}
+
+// Start attaches to the container's streams and then starts it, like 'docker start'.
+func (c *Container) Start(ctx context.Context) error {
+	// The attachment made here is closed by the cleanup registered below.
+	// Open a connection to the container for parsing logs and for TTY.
+	streams, err := c.client.ContainerAttach(ctx, c.id,
+		types.ContainerAttachOptions{
+			Stream: true,
+			Stdin:  true,
+			Stdout: true,
+			Stderr: true,
+		})
+	if err != nil {
+		return fmt.Errorf("failed to connect to container: %v", err)
+	}
+
+	c.streams = streams
+	c.cleanups = append(c.cleanups, func() {
+		c.streams.Close()
+	})
+
+	return c.client.ContainerStart(ctx, c.id, types.ContainerStartOptions{})
+}
+
+// Stop is analogous to 'docker stop'; it passes a nil timeout to ContainerStop.
+func (c *Container) Stop(ctx context.Context) error {
+	return c.client.ContainerStop(ctx, c.id, nil)
+}
+
+// Pause is analogous to 'docker pause'.
+func (c *Container) Pause(ctx context.Context) error {
+	return c.client.ContainerPause(ctx, c.id)
+}
+
+// Unpause is analogous to 'docker unpause' (wraps client.ContainerUnpause).
+func (c *Container) Unpause(ctx context.Context) error {
+	return c.client.ContainerUnpause(ctx, c.id)
+}
+
+// Checkpoint is analogous to 'docker checkpoint create'; the checkpoint is created with Exit set.
+func (c *Container) Checkpoint(ctx context.Context, name string) error {
+	return c.client.CheckpointCreate(ctx, c.Name, types.CheckpointCreateOptions{CheckpointID: name, Exit: true})
+}
+
+// Restore is analogous to 'docker start --checkpoint [name]'.
+func (c *Container) Restore(ctx context.Context, name string) error {
+	return c.client.ContainerStart(ctx, c.id, types.ContainerStartOptions{CheckpointID: name})
+}
+
+// Logs is analogous to 'docker logs'. Stdout and stderr are written to one buffer.
+func (c *Container) Logs(ctx context.Context) (string, error) {
+	var out bytes.Buffer
+	err := c.logs(ctx, &out, &out)
+	return out.String(), err
+}
+
+// logs copies the container's stdout and stderr log streams into the given buffers.
+func (c *Container) logs(ctx context.Context, stdout, stderr *bytes.Buffer) error {
+	opts := types.ContainerLogsOptions{ShowStdout: true, ShowStderr: true}
+	reader, err := c.client.ContainerLogs(ctx, c.id, opts)
+	if err != nil {
+		return err
+	}
+	defer reader.Close()
+	_, err = stdcopy.StdCopy(stdout, stderr, reader)
+	return err
+}
+
+// ID returns the container id recorded at creation (empty until then).
+func (c *Container) ID() string {
+	return c.id
+}
+
+// SandboxPid returns the container's pid (State.Pid from inspect), or -1 on error.
+func (c *Container) SandboxPid(ctx context.Context) (int, error) {
+	resp, err := c.client.ContainerInspect(ctx, c.id)
+	if err != nil {
+		return -1, err
+	}
+	return resp.ContainerJSONBase.State.Pid, nil
+}
+
+// FindIP returns the IP address of the container (the inspect result's
+// DefaultNetworkSettings.IPAddress), or an error quoting the unparsable value.
+func (c *Container) FindIP(ctx context.Context) (net.IP, error) {
+	resp, err := c.client.ContainerInspect(ctx, c.id)
+	if err != nil {
+		return nil, err
+	}
+	addr := resp.NetworkSettings.DefaultNetworkSettings.IPAddress
+	if ip := net.ParseIP(addr); ip != nil {
+		return ip, nil
+	}
+	return net.IP{}, fmt.Errorf("invalid IP: %q", addr)
+}
+
+// FindPort returns the host port that is mapped to TCP port 'sandboxPort'. It
+// fails if the container cannot be inspected or the port has no host binding.
+func (c *Container) FindPort(ctx context.Context, sandboxPort int) (int, error) {
+	desc, err := c.client.ContainerInspect(ctx, c.id)
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving port: %v", err)
+	}
+
+	key := nat.Port(fmt.Sprintf("%d/tcp", sandboxPort))
+	// A missing key and an empty binding list are both rejected; indexing would panic.
+	bindings := desc.NetworkSettings.Ports[key]
+	if len(bindings) == 0 {
+		return -1, fmt.Errorf("error retrieving port: no host port mapped for %q", key)
+	}
+	port, err := strconv.Atoi(bindings[0].HostPort)
+	if err != nil {
+		return -1, fmt.Errorf("error parsing port %q: %v", bindings[0].HostPort, err)
+	}
+	return port, nil
+}
+
+// CopyFiles copies in and mounts the given files read-only at target; failures are recorded in c.copyErr.
+func (c *Container) CopyFiles(opts *RunOpts, target string, sources ...string) {
+	dir, err := ioutil.TempDir("", c.Name)
+	if err != nil {
+		c.copyErr = fmt.Errorf("ioutil.TempDir failed: %v", err)
+		return
+	}
+	c.cleanups = append(c.cleanups, func() { os.RemoveAll(dir) })
+	if err := os.Chmod(dir, 0755); err != nil {
+		c.copyErr = fmt.Errorf("os.Chmod(%q, 0755) failed: %v", dir, err)
+		return
+	}
+	for _, name := range sources {
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err)
+			return
+		}
+		dst := path.Join(dir, path.Base(name))
+		if err := testutil.Copy(src, dst); err != nil {
+			c.copyErr = fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
+			return
+		}
+		c.logger.Logf("copy: %s -> %s", src, dst)
+	}
+	opts.Mounts = append(opts.Mounts, mount.Mount{
+		Type:     mount.TypeBind,
+		Source:   dir,
+		Target:   target,
+		ReadOnly: true, // Matches the documented read-only contract.
+	})
+}
+
+// Status inspects the container and returns its state.
+func (c *Container) Status(ctx context.Context) (types.ContainerState, error) {
+	resp, err := c.client.ContainerInspect(ctx, c.id)
+	if err != nil {
+		return types.ContainerState{}, err
+	}
+	return *resp.State, err
+}
+
+// Wait waits for the container to stop running; the reported exit status is discarded.
+func (c *Container) Wait(ctx context.Context) error {
+	statusChan, errChan := c.client.ContainerWait(ctx, c.id, container.WaitConditionNotRunning)
+	select {
+	case err := <-errChan:
+		return err
+	case <-statusChan:
+		return nil
+	}
+}
+
+// WaitTimeout waits for the container to exit, or returns an error once timeout elapses.
+func (c *Container) WaitTimeout(ctx context.Context, timeout time.Duration) error {
+	timeoutChan := time.After(timeout)
+	statusChan, errChan := c.client.ContainerWait(ctx, c.id, container.WaitConditionNotRunning)
+	select {
+	case err := <-errChan:
+		return err
+	case <-statusChan:
+		return nil
+	case <-timeoutChan:
+		return fmt.Errorf("container %s timed out after %v seconds", c.Name, timeout.Seconds())
+	}
+}
+
+// WaitForOutput searches container logs for pattern and returns the match, or times out.
+func (c *Container) WaitForOutput(ctx context.Context, pattern string, timeout time.Duration) (string, error) {
+	matches, err := c.WaitForOutputSubmatch(ctx, pattern, timeout)
+	if err != nil {
+		return "", err
+	}
+	if len(matches) == 0 {
+		return "", fmt.Errorf("didn't find pattern %s logs", pattern)
+	}
+	return matches[0], nil
+}
+
+// WaitForOutputSubmatch searches the attached output for pattern (a regexp; panics
+// if invalid) until timeout, returning the full match followed by any submatches.
+func (c *Container) WaitForOutputSubmatch(ctx context.Context, pattern string, timeout time.Duration) ([]string, error) {
+	re := regexp.MustCompile(pattern)
+	if matches := re.FindStringSubmatch(c.streamBuf.String()); matches != nil {
+		return matches, nil
+	}
+
+	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
+		c.streams.Conn.SetDeadline(time.Now().Add(50 * time.Millisecond))
+		_, err := stdcopy.StdCopy(&c.streamBuf, &c.streamBuf, c.streams.Reader)
+
+		if err != nil {
+			// A deadline timeout just means "poll again"; any other error is fatal.
+			if nerr, ok := err.(net.Error); !ok || !nerr.Timeout() {
+				return nil, err
+			}
+		}
+
+		if matches := re.FindStringSubmatch(c.streamBuf.String()); matches != nil {
+			return matches, nil
+		}
+	}
+
+	return nil, fmt.Errorf("timeout waiting for output %q: out: %s", re.String(), c.streamBuf.String())
+}
+
+// Kill kills the container; an empty signal name is passed to ContainerKill.
+func (c *Container) Kill(ctx context.Context) error {
+	return c.client.ContainerKill(ctx, c.id, "")
+}
+
+// Remove is analogous to 'docker rm'; the removal is always forced.
+func (c *Container) Remove(ctx context.Context) error {
+	// Remove the container (not the image), looked up by name.
+	remove := types.ContainerRemoveOptions{
+		RemoveVolumes: c.mounts != nil,
+		RemoveLinks:   c.links != nil,
+		Force:         true,
+	}
+	return c.client.ContainerRemove(ctx, c.Name, remove)
+}
+
+// CleanUp kills and removes the container, then runs the registered cleanups.
+// It is best effort: failures are only logged.
+func (c *Container) CleanUp(ctx context.Context) {
+	// A container that is no longer running is expected; don't log that case.
+	if err := c.Kill(ctx); err != nil && !strings.Contains(err.Error(), "is not running") {
+		c.logger.Logf("error killing container %q: %v", c.Name, err)
+	}
+	// Remove the container itself.
+	if err := c.Remove(ctx); err != nil {
+		c.logger.Logf("error removing container %q: %v", c.Name, err)
+	}
+	// Forget all mounts.
+	c.mounts = nil
+	// Run every registered cleanup, then drop them so they cannot run twice.
+	for _, fn := range c.cleanups {
+		fn()
+	}
+	c.cleanups = nil
+}
diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
index 819dd0a59..f95ae3cd1 100644
--- a/pkg/test/dockerutil/dockerutil.go
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -22,17 +22,10 @@ import (
 	"io"
 	"io/ioutil"
 	"log"
-	"net"
-	"os"
 	"os/exec"
-	"path"
 	"regexp"
 	"strconv"
-	"strings"
-	"syscall"
-	"time"
 
-	"github.com/kr/pty"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
@@ -126,596 +119,3 @@ func Save(logger testutil.Logger, image string, w io.Writer) error {
 	cmd.Stdout = w // Send directly to the writer.
 	return cmd.Run()
 }
-
-// MountMode describes if the mount should be ro or rw.
-type MountMode int
-
-const (
-	// ReadOnly is what the name says.
-	ReadOnly MountMode = iota
-	// ReadWrite is what the name says.
-	ReadWrite
-)
-
-// String returns the mount mode argument for this MountMode.
-func (m MountMode) String() string {
-	switch m {
-	case ReadOnly:
-		return "ro"
-	case ReadWrite:
-		return "rw"
-	}
-	panic(fmt.Sprintf("invalid mode: %d", m))
-}
-
-// DockerNetwork contains the name of a docker network.
-type DockerNetwork struct {
-	logger     testutil.Logger
-	Name       string
-	Subnet     *net.IPNet
-	containers []*Docker
-}
-
-// NewDockerNetwork sets up the struct for a Docker network. Names of networks
-// will be unique.
-func NewDockerNetwork(logger testutil.Logger) *DockerNetwork {
-	return &DockerNetwork{
-		logger: logger,
-		Name:   testutil.RandomID(logger.Name()),
-	}
-}
-
-// Create calls 'docker network create'.
-func (n *DockerNetwork) Create(args ...string) error {
-	a := []string{"docker", "network", "create"}
-	if n.Subnet != nil {
-		a = append(a, fmt.Sprintf("--subnet=%s", n.Subnet))
-	}
-	a = append(a, args...)
-	a = append(a, n.Name)
-	return testutil.Command(n.logger, a...).Run()
-}
-
-// Connect calls 'docker network connect' with the arguments provided.
-func (n *DockerNetwork) Connect(container *Docker, args ...string) error {
-	a := []string{"docker", "network", "connect"}
-	a = append(a, args...)
-	a = append(a, n.Name, container.Name)
-	if err := testutil.Command(n.logger, a...).Run(); err != nil {
-		return err
-	}
-	n.containers = append(n.containers, container)
-	return nil
-}
-
-// Cleanup cleans up the docker network and all the containers attached to it.
-func (n *DockerNetwork) Cleanup() error {
-	for _, c := range n.containers {
-		// Don't propagate the error, it might be that the container
-		// was already cleaned up.
-		if err := c.Kill(); err != nil {
-			n.logger.Logf("unable to kill container during cleanup: %s", err)
-		}
-	}
-
-	if err := testutil.Command(n.logger, "docker", "network", "rm", n.Name).Run(); err != nil {
-		return err
-	}
-	return nil
-}
-
-// Docker contains the name and the runtime of a docker container.
-type Docker struct {
-	logger   testutil.Logger
-	Runtime  string
-	Name     string
-	copyErr  error
-	cleanups []func()
-}
-
-// MakeDocker sets up the struct for a Docker container.
-//
-// Names of containers will be unique.
-func MakeDocker(logger testutil.Logger) *Docker {
-	// Slashes are not allowed in container names.
-	name := testutil.RandomID(logger.Name())
-	name = strings.ReplaceAll(name, "/", "-")
-
-	return &Docker{
-		logger:  logger,
-		Name:    name,
-		Runtime: *runtime,
-	}
-}
-
-// CopyFiles copies in and mounts the given files. They are always ReadOnly.
-func (d *Docker) CopyFiles(opts *RunOpts, targetDir string, sources ...string) {
-	dir, err := ioutil.TempDir("", d.Name)
-	if err != nil {
-		d.copyErr = fmt.Errorf("ioutil.TempDir failed: %v", err)
-		return
-	}
-	d.cleanups = append(d.cleanups, func() { os.RemoveAll(dir) })
-	if err := os.Chmod(dir, 0755); err != nil {
-		d.copyErr = fmt.Errorf("os.Chmod(%q, 0755) failed: %v", dir, err)
-		return
-	}
-	for _, name := range sources {
-		src, err := testutil.FindFile(name)
-		if err != nil {
-			d.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err)
-			return
-		}
-		dst := path.Join(dir, path.Base(name))
-		if err := testutil.Copy(src, dst); err != nil {
-			d.copyErr = fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
-			return
-		}
-		d.logger.Logf("copy: %s -> %s", src, dst)
-	}
-	opts.Mounts = append(opts.Mounts, Mount{
-		Source: dir,
-		Target: targetDir,
-		Mode:   ReadOnly,
-	})
-}
-
-// Mount describes a mount point inside the container.
-type Mount struct {
-	// Source is the path outside the container.
-	Source string
-
-	// Target is the path inside the container.
-	Target string
-
-	// Mode tells whether the mount inside the container should be readonly.
-	Mode MountMode
-}
-
-// Link informs dockers that a given container needs to be made accessible from
-// the container being configured.
-type Link struct {
-	// Source is the container to connect to.
-	Source *Docker
-
-	// Target is the alias for the container.
-	Target string
-}
-
-// RunOpts are options for running a container.
-type RunOpts struct {
-	// Image is the image relative to images/. This will be mangled
-	// appropriately, to ensure that only first-party images are used.
-	Image string
-
-	// Memory is the memory limit in kB.
-	Memory int
-
-	// Ports are the ports to be allocated.
-	Ports []int
-
-	// WorkDir sets the working directory.
-	WorkDir string
-
-	// ReadOnly sets the read-only flag.
-	ReadOnly bool
-
-	// Env are additional environment variables.
-	Env []string
-
-	// User is the user to use.
-	User string
-
-	// Privileged enables privileged mode.
-	Privileged bool
-
-	// CapAdd are the extra set of capabilities to add.
-	CapAdd []string
-
-	// CapDrop are the extra set of capabilities to drop.
-	CapDrop []string
-
-	// Pty indicates that a pty will be allocated. If this is non-nil, then
-	// this will run after start-up with the *exec.Command and Pty file
-	// passed in to the function.
-	Pty func(*exec.Cmd, *os.File)
-
-	// Foreground indicates that the container should be run in the
-	// foreground. If this is true, then the output will be available as a
-	// return value from the Run function.
-	Foreground bool
-
-	// Mounts is the list of directories/files to be mounted inside the container.
-	Mounts []Mount
-
-	// Links is the list of containers to be connected to the container.
-	Links []Link
-
-	// Extra are extra arguments that may be passed.
-	Extra []string
-}
-
-// args returns common arguments.
-//
-// Note that this does not define the complete behavior.
-func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) {
-	isExec := command == "exec"
-	isRun := command == "run"
-
-	if isRun || isExec {
-		rv = append(rv, "-i")
-	}
-	if r.Pty != nil {
-		rv = append(rv, "-t")
-	}
-	if r.User != "" {
-		rv = append(rv, fmt.Sprintf("--user=%s", r.User))
-	}
-	if r.Privileged {
-		rv = append(rv, "--privileged")
-	}
-	for _, c := range r.CapAdd {
-		rv = append(rv, fmt.Sprintf("--cap-add=%s", c))
-	}
-	for _, c := range r.CapDrop {
-		rv = append(rv, fmt.Sprintf("--cap-drop=%s", c))
-	}
-	for _, e := range r.Env {
-		rv = append(rv, fmt.Sprintf("--env=%s", e))
-	}
-	if r.WorkDir != "" {
-		rv = append(rv, fmt.Sprintf("--workdir=%s", r.WorkDir))
-	}
-	if !isExec {
-		if r.Memory != 0 {
-			rv = append(rv, fmt.Sprintf("--memory=%dk", r.Memory))
-		}
-		for _, p := range r.Ports {
-			rv = append(rv, fmt.Sprintf("--publish=%d", p))
-		}
-		if r.ReadOnly {
-			rv = append(rv, fmt.Sprintf("--read-only"))
-		}
-		if len(p) > 0 {
-			rv = append(rv, "--entrypoint=")
-		}
-	}
-
-	// Always attach the test environment & Extra.
-	rv = append(rv, fmt.Sprintf("--env=RUNSC_TEST_NAME=%s", d.Name))
-	rv = append(rv, r.Extra...)
-
-	// Attach necessary bits.
-	if isExec {
-		rv = append(rv, d.Name)
-	} else {
-		for _, m := range r.Mounts {
-			rv = append(rv, fmt.Sprintf("-v=%s:%s:%v", m.Source, m.Target, m.Mode))
-		}
-		for _, l := range r.Links {
-			rv = append(rv, fmt.Sprintf("--link=%s:%s", l.Source.Name, l.Target))
-		}
-
-		if len(d.Runtime) > 0 {
-			rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
-		}
-		rv = append(rv, fmt.Sprintf("--name=%s", d.Name))
-		rv = append(rv, testutil.ImageByName(r.Image))
-	}
-
-	// Attach other arguments.
-	rv = append(rv, p...)
-	return rv
-}
-
-// run runs a complete command.
-func (d *Docker) run(r RunOpts, command string, p ...string) (string, error) {
-	if d.copyErr != nil {
-		return "", d.copyErr
-	}
-	basicArgs := []string{"docker"}
-	if command == "spawn" {
-		command = "run"
-		basicArgs = append(basicArgs, command)
-		basicArgs = append(basicArgs, "-d")
-	} else {
-		basicArgs = append(basicArgs, command)
-	}
-	customArgs := d.argsFor(&r, command, p)
-	cmd := testutil.Command(d.logger, append(basicArgs, customArgs...)...)
-	if r.Pty != nil {
-		// If allocating a terminal, then we just ignore the output
-		// from the command.
-		ptmx, err := pty.Start(cmd.Cmd)
-		if err != nil {
-			return "", err
-		}
-		defer cmd.Wait() // Best effort.
-		r.Pty(cmd.Cmd, ptmx)
-	} else {
-		// Can't support PTY or streaming.
-		out, err := cmd.CombinedOutput()
-		return string(out), err
-	}
-	return "", nil
-}
-
-// Create calls 'docker create' with the arguments provided.
-func (d *Docker) Create(r RunOpts, args ...string) error {
-	out, err := d.run(r, "create", args...)
-	if strings.Contains(out, "Unable to find image") {
-		return fmt.Errorf("unable to find image, did you remember to `make load-%s`: %w", r.Image, err)
-	}
-	return err
-}
-
-// Start calls 'docker start'.
-func (d *Docker) Start() error {
-	return testutil.Command(d.logger, "docker", "start", d.Name).Run()
-}
-
-// Stop calls 'docker stop'.
-func (d *Docker) Stop() error {
-	return testutil.Command(d.logger, "docker", "stop", d.Name).Run()
-}
-
-// Run calls 'docker run' with the arguments provided.
-func (d *Docker) Run(r RunOpts, args ...string) (string, error) {
-	return d.run(r, "run", args...)
-}
-
-// Spawn starts the container and detaches.
-func (d *Docker) Spawn(r RunOpts, args ...string) error {
-	_, err := d.run(r, "spawn", args...)
-	return err
-}
-
-// Logs calls 'docker logs'.
-func (d *Docker) Logs() (string, error) {
-	// Don't capture the output; since it will swamp the logs.
-	out, err := exec.Command("docker", "logs", d.Name).CombinedOutput()
-	return string(out), err
-}
-
-// Exec calls 'docker exec' with the arguments provided.
-func (d *Docker) Exec(r RunOpts, args ...string) (string, error) {
-	return d.run(r, "exec", args...)
-}
-
-// Pause calls 'docker pause'.
-func (d *Docker) Pause() error {
-	return testutil.Command(d.logger, "docker", "pause", d.Name).Run()
-}
-
-// Unpause calls 'docker pause'.
-func (d *Docker) Unpause() error {
-	return testutil.Command(d.logger, "docker", "unpause", d.Name).Run()
-}
-
-// Checkpoint calls 'docker checkpoint'.
-func (d *Docker) Checkpoint(name string) error {
-	return testutil.Command(d.logger, "docker", "checkpoint", "create", d.Name, name).Run()
-}
-
-// Restore calls 'docker start --checkname [name]'.
-func (d *Docker) Restore(name string) error {
-	return testutil.Command(d.logger, "docker", "start", fmt.Sprintf("--checkpoint=%s", name), d.Name).Run()
-}
-
-// Kill calls 'docker kill'.
-func (d *Docker) Kill() error {
-	// Skip logging this command, it will likely be an error.
-	out, err := exec.Command("docker", "kill", d.Name).CombinedOutput()
-	if err != nil && !strings.Contains(string(out), "is not running") {
-		return err
-	}
-	return nil
-}
-
-// Remove calls 'docker rm'.
-func (d *Docker) Remove() error {
-	return testutil.Command(d.logger, "docker", "rm", d.Name).Run()
-}
-
-// CleanUp kills and deletes the container (best effort).
-func (d *Docker) CleanUp() {
-	// Kill the container.
-	if err := d.Kill(); err != nil {
-		// Just log; can't do anything here.
-		d.logger.Logf("error killing container %q: %v", d.Name, err)
-	}
-	// Remove the image.
-	if err := d.Remove(); err != nil {
-		d.logger.Logf("error removing container %q: %v", d.Name, err)
-	}
-	// Execute all cleanups.
-	for _, c := range d.cleanups {
-		c()
-	}
-	d.cleanups = nil
-}
-
-// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
-// docker to allocate a free port in the host and prevent conflicts.
-func (d *Docker) FindPort(sandboxPort int) (int, error) {
-	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
-	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving port: %v", err)
-	}
-	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
-	}
-	return port, nil
-}
-
-// FindIP returns the IP address of the container.
-func (d *Docker) FindIP() (net.IP, error) {
-	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
-	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
-	if err != nil {
-		return net.IP{}, fmt.Errorf("error retrieving IP: %v", err)
-	}
-	ip := net.ParseIP(strings.TrimSpace(string(out)))
-	if ip == nil {
-		return net.IP{}, fmt.Errorf("invalid IP: %q", string(out))
-	}
-	return ip, nil
-}
-
-// A NetworkInterface is container's network interface information.
-type NetworkInterface struct {
-	IPv4 net.IP
-	MAC  net.HardwareAddr
-}
-
-// ListNetworks returns the network interfaces of the container, keyed by
-// Docker network name.
-func (d *Docker) ListNetworks() (map[string]NetworkInterface, error) {
-	const format = `{{json .NetworkSettings.Networks}}`
-	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
-	if err != nil {
-		return nil, fmt.Errorf("error network interfaces: %q: %w", string(out), err)
-	}
-
-	networks := map[string]map[string]string{}
-	if err := json.Unmarshal(out, &networks); err != nil {
-		return nil, fmt.Errorf("error decoding network interfaces: %w", err)
-	}
-
-	interfaces := map[string]NetworkInterface{}
-	for name, iface := range networks {
-		var netface NetworkInterface
-
-		rawIP := strings.TrimSpace(iface["IPAddress"])
-		if rawIP != "" {
-			ip := net.ParseIP(rawIP)
-			if ip == nil {
-				return nil, fmt.Errorf("invalid IP: %q", rawIP)
-			}
-			// Docker's IPAddress field is IPv4. The IPv6 address
-			// is stored in the GlobalIPv6Address field.
-			netface.IPv4 = ip
-		}
-
-		rawMAC := strings.TrimSpace(iface["MacAddress"])
-		if rawMAC != "" {
-			mac, err := net.ParseMAC(rawMAC)
-			if err != nil {
-				return nil, fmt.Errorf("invalid MAC: %q: %w", rawMAC, err)
-			}
-			netface.MAC = mac
-		}
-
-		interfaces[name] = netface
-	}
-
-	return interfaces, nil
-}
-
-// SandboxPid returns the PID to the sandbox process.
-func (d *Docker) SandboxPid() (int, error) {
-	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput()
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving pid: %v", err)
-	}
-	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
-	}
-	return pid, nil
-}
-
-// ID returns the container ID.
-func (d *Docker) ID() (string, error) {
-	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.Id}}", d.Name).CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("error retrieving ID: %v", err)
-	}
-	return strings.TrimSpace(string(out)), nil
-}
-
-// Wait waits for container to exit, up to the given timeout. Returns error if
-// wait fails or timeout is hit. Returns the application return code otherwise.
-// Note that the application may have failed even if err == nil, always check
-// the exit code.
-func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
-	timeoutChan := time.After(timeout)
-	waitChan := make(chan (syscall.WaitStatus))
-	errChan := make(chan (error))
-
-	go func() {
-		out, err := testutil.Command(d.logger, "docker", "wait", d.Name).CombinedOutput()
-		if err != nil {
-			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
-		}
-		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-		if err != nil {
-			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
-		}
-		waitChan <- syscall.WaitStatus(uint32(exit))
-	}()
-
-	select {
-	case ws := <-waitChan:
-		return ws, nil
-	case err := <-errChan:
-		return syscall.WaitStatus(1), err
-	case <-timeoutChan:
-		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
-	}
-}
-
-// WaitForOutput calls 'docker logs' to retrieve containers output and searches
-// for the given pattern.
-func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
-	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
-	if err != nil {
-		return "", err
-	}
-	if len(matches) == 0 {
-		return "", nil
-	}
-	return matches[0], nil
-}
-
-// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
-// searches for the given pattern. It returns any regexp submatches as well.
-func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
-	re := regexp.MustCompile(pattern)
-	var (
-		lastOut string
-		stopped bool
-	)
-	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
-		out, err := d.Logs()
-		if err != nil {
-			return nil, err
-		}
-		if out != lastOut {
-			if lastOut == "" {
-				d.logger.Logf("output (start): %s", out)
-			} else if strings.HasPrefix(out, lastOut) {
-				d.logger.Logf("output (contn): %s", out[len(lastOut):])
-			} else {
-				d.logger.Logf("output (trunc): %s", out)
-			}
-			lastOut = out // Save for future.
-			if matches := re.FindStringSubmatch(lastOut); matches != nil {
-				return matches, nil // Success!
-			}
-		} else if stopped {
-			// The sandbox stopped and we looked at the
-			// logs at least once since determining that.
-			return nil, fmt.Errorf("no longer running: %v", err)
-		} else if pid, err := d.SandboxPid(); pid == 0 || err != nil {
-			// The sandbox may have stopped, but it's
-			// possible that it has emitted the terminal
-			// line between the last call to Logs and here.
-			stopped = true
-		}
-		time.Sleep(100 * time.Millisecond)
-	}
-	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), lastOut)
-}
diff --git a/pkg/test/dockerutil/exec.go b/pkg/test/dockerutil/exec.go
new file mode 100644
index 000000000..921d1da9e
--- /dev/null
+++ b/pkg/test/dockerutil/exec.go
@@ -0,0 +1,194 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dockerutil
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/pkg/stdcopy"
+)
+
+// ExecOpts holds the optional settings accepted by Exec and ExecProcess.
+type ExecOpts struct {
+	// Env are additional environment variables; RUNSC_TEST_NAME is always added.
+	Env []string
+
+	// Privileged enables privileged mode.
+	Privileged bool
+
+	// User is the user the process is run as.
+	User string
+
+	// UseTTY enables a TTY and attaches stdin for the created process.
+	UseTTY bool
+
+	// WorkDir is the working directory of the process.
+	WorkDir string
+}
+
+// Exec runs a process inside the container and returns its combined output.
+func (c *Container) Exec(ctx context.Context, opts ExecOpts, args ...string) (string, error) {
+	p, err := c.doExec(ctx, opts, args)
+	if err != nil {
+		return "", err
+	}
+	// The attached connection is only used within this call; never leak it.
+	defer p.conn.Close()
+	if exitStatus, err := p.WaitExitStatus(ctx); err != nil {
+		return "", err
+	} else if exitStatus != 0 {
+		out, _ := p.Logs()
+		return out, fmt.Errorf("process terminated with status: %d", exitStatus)
+	}
+	return p.Logs()
+}
+
+// ExecProcess starts a process inside the container and hands its Process back
+// to the caller without waiting for it to exit (Exec is the waiting variant).
+func (c *Container) ExecProcess(ctx context.Context, opts ExecOpts, args ...string) (Process, error) {
+	return c.doExec(ctx, opts, args)
+}
+
+// doExec creates an exec instance for args in the container, attaches to its
+// streams and then starts it. The returned Process holds the exec ID and the
+// attached connection.
+func (c *Container) doExec(ctx context.Context, r ExecOpts, args []string) (Process, error) {
+	created, err := c.client.ContainerExecCreate(ctx, c.id, c.execConfig(r, args))
+	if err != nil {
+		return Process{}, fmt.Errorf("exec create failed with err: %v", err)
+	}
+
+	attached, err := c.client.ContainerExecAttach(ctx, created.ID, types.ExecStartCheck{})
+	if err != nil {
+		return Process{}, fmt.Errorf("exec attach failed with err: %v", err)
+	}
+
+	err = c.client.ContainerExecStart(ctx, created.ID, types.ExecStartCheck{})
+	if err != nil {
+		// Nothing will ever use the attached connection, so release it.
+		attached.Close()
+		return Process{}, fmt.Errorf("exec start failed with err: %v", err)
+	}
+
+	proc := Process{container: c, execid: created.ID, conn: attached}
+	return proc, nil
+}
+
+// execConfig builds the docker exec config; r.Env is copied, never appended to in place.
+func (c *Container) execConfig(r ExecOpts, cmd []string) types.ExecConfig {
+	env := append(append([]string(nil), r.Env...), fmt.Sprintf("RUNSC_TEST_NAME=%s", c.Name))
+	return types.ExecConfig{
+		AttachStdin:  r.UseTTY,
+		AttachStderr: true,
+		AttachStdout: true,
+		Cmd:          cmd,
+		Privileged:   r.Privileged,
+		WorkingDir:   r.WorkDir,
+		Env:          env,
+		Tty:          r.UseTTY,
+		User:         r.User,
+	}
+}
+
+// Process represents a containerized process.
+type Process struct {
+	container *Container             // container the process belongs to
+	execid    string                 // docker exec ID; empty for the container's root process
+	conn      types.HijackedResponse // attached connection used for stdin and for reading output
+}
+
+// Write writes buf to the process's stdin after setting a now+timeout deadline on the connection.
+func (p *Process) Write(timeout time.Duration, buf []byte) (int, error) {
+	p.conn.Conn.SetDeadline(time.Now().Add(timeout)) // NOTE(review): the error from SetDeadline is discarded.
+	return p.conn.Conn.Write(buf)
+}
+
+// Read returns process's stdout and stderr.
+func (p *Process) Read() (string, string, error) {
+	var stdout, stderr bytes.Buffer
+	if err := p.read(&stdout, &stderr); err != nil {
+		return "", "", err
+	}
+	return stdout.String(), stderr.String(), nil
+}
+
+// Logs returns combined stdout/stderr from the process.
+func (p *Process) Logs() (string, error) {
+	var out bytes.Buffer
+	if err := p.read(&out, &out); err != nil {
+		return "", err
+	}
+	return out.String(), nil
+}
+
+func (p *Process) read(stdout, stderr *bytes.Buffer) error {
+	_, err := stdcopy.StdCopy(stdout, stderr, p.conn.Reader) // copies from the attached connection; the byte count is dropped
+	return err
+}
+
+// ExitCode returns the process's exit code; it is only valid once the process has exited.
+func (p *Process) ExitCode(ctx context.Context) (int, error) {
+	_, exitCode, err := p.runningExitCode(ctx)
+	return exitCode, err
+}
+
+// IsRunning reports whether the process is currently marked as running.
+func (p *Process) IsRunning(ctx context.Context) (bool, error) {
+	running, _, err := p.runningExitCode(ctx)
+	return running, err
+}
+
+// WaitExitStatus polls until the process completes and returns its exit
+// status. It returns -1 and an error if the status cannot be queried or if
+// ctx is done before the process exits.
+func (p *Process) WaitExitStatus(ctx context.Context) (int, error) {
+	// Poll on the calling goroutine: a helper goroutine feeding unbuffered
+	// channels would stay blocked forever once this function had returned.
+	ticker := time.NewTicker(500 * time.Millisecond)
+	defer ticker.Stop()
+
+	for {
+		running, exitcode, err := p.runningExitCode(ctx)
+		if err != nil {
+			return -1, fmt.Errorf("error waiting process %s: container %v: %v", p.execid, p.container.Name, err)
+		}
+		if !running {
+			return exitcode, nil
+		}
+
+		select {
+		case <-ctx.Done():
+			return -1, ctx.Err()
+		case <-ticker.C:
+		}
+	}
+}
+
+// runningExitCode reports whether the process is running along with its exit
+// code. The exit code is only valid if the process has exited.
+func (p *Process) runningExitCode(ctx context.Context) (bool, int, error) {
+	if p.execid == "" {
+		// No exec ID: this Process is the container's root process.
+		state, err := p.container.Status(ctx)
+		return state.Running, state.ExitCode, err
+	}
+	// Otherwise it was exec'ed, so inspect the exec instance instead.
+	inspect, err := p.container.client.ContainerExecInspect(ctx, p.execid)
+	return inspect.Running, inspect.ExitCode, err
+}
diff --git a/pkg/test/dockerutil/network.go b/pkg/test/dockerutil/network.go
new file mode 100644
index 000000000..047091e75
--- /dev/null
+++ b/pkg/test/dockerutil/network.go
@@ -0,0 +1,113 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dockerutil
+
+import (
+	"context"
+	"net"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/api/types/network"
+	"github.com/docker/docker/client"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+// Network is a docker network.
+type Network struct {
+	client     *client.Client  // docker API client used for every call
+	id         string          // network ID returned by docker; set by Create
+	logger     testutil.Logger // logger supplied to NewNetwork
+	Name       string          // random ID derived from the logger's name
+	containers []*Container    // containers attached through Connect
+	Subnet     *net.IPNet      // optional subnet passed to docker by Create
+}
+
+// NewNetwork builds a Network that talks to docker through a client configured
+// from the environment. It logs and returns nil if no client can be created.
+func NewNetwork(ctx context.Context, logger testutil.Logger) *Network {
+	cli, err := client.NewClientWithOpts(client.FromEnv)
+	if err != nil {
+		logger.Logf("create client failed with: %v", err)
+		return nil
+	}
+	cli.NegotiateAPIVersion(ctx)
+
+	n := new(Network)
+	n.client, n.logger = cli, logger
+	// A random ID derived from the logger's name keeps network names unique.
+	n.Name = testutil.RandomID(logger.Name())
+	return n
+}
+
+func (n *Network) networkCreate() types.NetworkCreate {
+
+	var subnet string
+	if n.Subnet != nil {
+		subnet = n.Subnet.String()
+	}
+
+	ipam := network.IPAM{
+		Config: []network.IPAMConfig{{
+			Subnet: subnet,
+		}},
+	}
+
+	return types.NetworkCreate{
+		CheckDuplicate: true,
+		IPAM:           &ipam,
+	}
+}
+
+// Create is analogous to 'docker network create'. On success the ID assigned
+// by docker is stored on the Network.
+func (n *Network) Create(ctx context.Context) error {
+	options := n.networkCreate()
+	created, err := n.client.NetworkCreate(ctx, n.Name, options)
+	if err == nil {
+		// Connect, Inspect and Cleanup all address the network by this ID.
+		n.id = created.ID
+	}
+	return err
+}
+
+// Connect is analogous to 'docker network connect' with the arguments provided.
+// Containers connected successfully are recorded so that Cleanup handles them.
+func (n *Network) Connect(ctx context.Context, container *Container, ipv4, ipv6 string) error {
+	addrs := &network.EndpointIPAMConfig{IPv4Address: ipv4, IPv6Address: ipv6}
+	endpoint := network.EndpointSettings{IPAMConfig: addrs}
+
+	if err := n.client.NetworkConnect(ctx, n.id, container.id, &endpoint); err != nil {
+		return err
+	}
+
+	// Only track the container once docker has accepted the connection.
+	n.containers = append(n.containers, container)
+	return nil
+}
+
+// Inspect returns this network's info as reported by docker, with verbose output requested.
+func (n *Network) Inspect(ctx context.Context) (types.NetworkResource, error) {
+	return n.client.NetworkInspect(ctx, n.id, types.NetworkInspectOptions{Verbose: true})
+}
+
+// Cleanup cleans up every container attached to the network, then the network.
+func (n *Network) Cleanup(ctx context.Context) error {
+	for i := range n.containers {
+		n.containers[i].CleanUp(ctx)
+	}
+	n.containers = nil
+	// The network itself is removed last, after its containers are gone.
+	return n.client.NetworkRemove(ctx, n.id)
+}
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index f21d6769a..64c292698 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -482,6 +482,21 @@ func IsStatic(filename string) (bool, error) {
 	return true, nil
 }
 
+// TouchShardStatusFile tells Bazel that the test runner supports sharding by
+// touching the file named by TEST_SHARD_STATUS_FILE, if that variable is set.
+// See https://docs.bazel.build/versions/master/test-encyclopedia.html#role-of-the-test-runner.
+func TouchShardStatusFile() error {
+	statusFile := os.Getenv("TEST_SHARD_STATUS_FILE")
+	if statusFile == "" {
+		return nil
+	}
+	out, err := exec.Command("touch", statusFile).CombinedOutput()
+	if err != nil {
+		return fmt.Errorf("touch %q failed:\n output: %s\n error: %s", statusFile, string(out), err.Error())
+	}
+	return nil
+}
+
 // TestIndicesForShard returns indices for this test shard based on the
 // TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
 //
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e83584b82..59639ba19 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,6 +29,7 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -390,6 +391,10 @@ type mountHint struct {
 	// root is the inode where the volume is mounted. For mounts with 'pod' share
 	// the volume is mounted once and then bind mounted inside the containers.
 	root *fs.Inode
+
+	// vfsMount is the master mount for the volume. For mounts with 'pod' share
+	// the master volume is bind mounted inside the containers.
+	vfsMount *vfs.Mount
 }
 
 func (m *mountHint) setField(key, val string) error {
@@ -571,9 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
 	if conf.VFS2 {
-		return nil
+		return c.processHintsVFS2(conf, creds)
 	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index b5df1deb9..0c0423ab2 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -227,9 +227,7 @@ func New(args Args) (*Loader, error) {
 	// Create VDSO.
 	//
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	//
-	// FIXME(b/109889800): Use non-nil context.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -300,6 +298,12 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		if err := registerFilesystems(k); err != nil {
+			return nil, fmt.Errorf("registering filesystems: %w", err)
+		}
+	}
+
 	if err := adjustDirentCache(k); err != nil {
 		return nil, err
 	}
@@ -561,7 +565,7 @@ func (l *Loader) run() error {
 		l.startGoferMonitor(l.sandboxID, l.goferFDs)
 
 		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-		if err := mntr.processHints(l.conf); err != nil {
+		if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil {
 			return err
 		}
 		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..b723e4335 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -480,7 +480,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			defer loaderCleanup()
 
 			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-			if err := mntr.processHints(l.conf); err != nil {
+			if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil {
 				t.Fatalf("failed process hints: %v", err)
 			}
 
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index b68117867..6ee6fae04 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -43,7 +43,11 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+	ctx := k.SupervisorContext()
+	creds := auth.NewRootCredentials(k.RootUserNamespace())
+	vfsObj := k.VFS()
+
 	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserList: true,
 		// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -113,9 +117,6 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
 }
 
 func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	if err := mntr.k.VFS().Init(); err != nil {
-		return fmt.Errorf("failed to initialize VFS: %w", err)
-	}
 	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
@@ -144,10 +145,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
 	rootCtx := procArgs.NewContext(c.k)
 
-	if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
-		return nil, fmt.Errorf("register filesystems: %w", err)
-	}
-
 	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
 	if err != nil {
 		return nil, fmt.Errorf("creating mount namespace: %w", err)
@@ -182,8 +179,14 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 	for i := range mounts {
 		submount := &mounts[i]
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
-		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
-			return err
+		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+			}
 		}
 	}
 
@@ -257,20 +260,18 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
 func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
-	var (
-		fsName string
-		data   []string
-	)
+	fsName := m.Type
+	var data []string
 
 	// Find filesystem name and FS specific data field.
 	switch m.Type {
 	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
-		fsName = m.Type
+		// Nothing to do.
+
 	case nonefs:
 		fsName = sys.Name
-	case tmpfs.Name:
-		fsName = m.Type
 
+	case tmpfs.Name:
 		var err error
 		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
@@ -279,10 +280,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 
 	case bind:
 		fsName = gofer.Name
+		if m.fd == 0 {
+			// Check that an FD was provided to fail fast. Technically FD=0 is valid,
+			// but unlikely to be correct in this context.
+			return "", nil, fmt.Errorf("9P mount requires a connection FD")
+		}
 		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
 
 	default:
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return "", nil, nil
 	}
 
 	opts := &vfs.MountOptions{
@@ -322,7 +329,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 	}
 	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
 	if err == nil {
-		// Mount point exists, nothing else to do.
+		log.Debugf("Mount point %q already exists", currentPath)
 		return nil
 	}
 	if err != syserror.ENOENT {
@@ -400,3 +407,76 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 		return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
 	}
 }
+
+// processHintsVFS2 walks the mount hints taken from container annotations
+// (e.g. a volume shared between containers) and mounts the master copy of
+// each supported one, storing the result on the hint. It must be called for
+// the root container only.
+func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+	ctx := c.k.SupervisorContext()
+	for _, h := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if h.mount.Type == tmpfs.Name {
+			log.Infof("Mounting master of shared mount %q from %q type %q", h.name, h.mount.Source, h.mount.Type)
+			master, err := c.mountSharedMasterVFS2(ctx, conf, h, creds)
+			if err != nil {
+				return fmt.Errorf("mounting shared master %q: %v", h.name, err)
+			}
+			// Shared submounts are later created from this master mount.
+			h.vfsMount = master
+		}
+	}
+	return nil
+}
+
+// mountSharedMasterVFS2 creates the disconnected master mount of a volume
+// that is shared among the containers of a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+	// Translate the mount type into a filesystem name and keep only the
+	// options that can be handled.
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: hint.mount})
+	switch {
+	case err != nil:
+		return nil, err
+	// An empty name is how an unsupported mount type is reported.
+	case fsName == "":
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmountVFS2 bind mounts, at mount.Destination inside mns, a
+// previously mounted volume that is shared among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+	if err := source.checkCompatible(mount); err != nil {
+		return err
+	}
+
+	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) // only the options are used; the filesystem comes from source.vfsMount
+	if err != nil {
+		return err
+	}
+	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+	if err != nil {
+		return err
+	}
+	defer newMnt.DecRef()
+
+	root := mns.Root()
+	defer root.DecRef()
+	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
+		return err
+	}
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(mount.Destination),
+	}
+	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+		return err
+	}
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return nil
+}
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index a2b0a4b14..55194e641 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -16,124 +16,122 @@ package cmd
 
 import (
 	"context"
-	"fmt"
-	"io/ioutil"
+	"encoding/json"
+	"io"
 	"os"
 	"path/filepath"
 
 	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/runsc/flag"
 )
 
-func genSpec(cwd string) []byte {
-	var template = fmt.Sprintf(`{
-	"ociVersion": "1.0.0",
-	"process": {
-		"terminal": true,
-		"user": {
-			"uid": 0,
-			"gid": 0
-		},
-		"args": [
-			"sh"
-		],
-		"env": [
-			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-			"TERM=xterm"
-		],
-		"cwd": "%s",
-		"capabilities": {
-			"bounding": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"effective": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"inheritable": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"permitted": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"ambient": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			]
-		},
-		"rlimits": [
-			{
-				"type": "RLIMIT_NOFILE",
-				"hard": 1024,
-				"soft": 1024
-			}
-		]
-	},
-	"root": {
-		"path": "rootfs",
-		"readonly": true
-	},
-	"hostname": "runsc",
-	"mounts": [
-		{
-			"destination": "/proc",
-			"type": "proc",
-			"source": "proc"
+func writeSpec(w io.Writer, cwd string, netns string, args []string) error {
+	spec := &specs.Spec{
+		Version: "1.0.0",
+		Process: &specs.Process{
+			Terminal: true,
+			User: specs.User{
+				UID: 0,
+				GID: 0,
+			},
+			Args: args,
+			Env: []string{
+				"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+				"TERM=xterm",
+			},
+			Cwd: cwd,
+			Capabilities: &specs.LinuxCapabilities{
+				Bounding: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				Effective: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				Inheritable: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				Permitted: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				// TODO(gvisor.dev/issue/3166): support ambient capabilities
+			},
+			Rlimits: []specs.POSIXRlimit{
+				{
+					Type: "RLIMIT_NOFILE",
+					Hard: 1024,
+					Soft: 1024,
+				},
+			},
 		},
-		{
-			"destination": "/dev",
-			"type": "tmpfs",
-			"source": "tmpfs",
-			"options": []
+		Root: &specs.Root{
+			Path:     "rootfs",
+			Readonly: true,
 		},
-		{
-			"destination": "/sys",
-			"type": "sysfs",
-			"source": "sysfs",
-			"options": [
-				"nosuid",
-				"noexec",
-				"nodev",
-				"ro"
-			]
-		}
-	],
-	"linux": {
-		"namespaces": [
+		Hostname: "runsc",
+		Mounts: []specs.Mount{
 			{
-				"type": "pid"
+				Destination: "/proc",
+				Type:        "proc",
+				Source:      "proc",
 			},
 			{
-				"type": "network"
+				Destination: "/dev",
+				Type:        "tmpfs",
+				Source:      "tmpfs",
 			},
 			{
-				"type": "ipc"
+				Destination: "/sys",
+				Type:        "sysfs",
+				Source:      "sysfs",
+				Options: []string{
+					"nosuid",
+					"noexec",
+					"nodev",
+					"ro",
+				},
 			},
-			{
-				"type": "uts"
+		},
+		Linux: &specs.Linux{
+			Namespaces: []specs.LinuxNamespace{
+				{
+					Type: "pid",
+				},
+				{
+					Type: "network",
+					Path: netns,
+				},
+				{
+					Type: "ipc",
+				},
+				{
+					Type: "uts",
+				},
+				{
+					Type: "mount",
+				},
 			},
-			{
-				"type": "mount"
-			}
-		]
+		},
 	}
-}`, cwd)
 
-	return []byte(template)
+	e := json.NewEncoder(w)
+	e.SetIndent("", "    ")
+	return e.Encode(spec)
 }
 
 // Spec implements subcommands.Command for the "spec" command.
 type Spec struct {
 	bundle string
 	cwd    string
+	netns  string
 }
 
 // Name implements subcommands.Command.Name.
@@ -148,21 +146,26 @@ func (*Spec) Synopsis() string {
 
 // Usage implements subcommands.Command.Usage.
 func (*Spec) Usage() string {
-	return `spec [options] - create a new OCI bundle specification file.
+	return `spec [options] [-- args...] - create a new OCI bundle specification file.
+
+The spec command creates a new specification file (config.json) for a new OCI
+bundle.
 
-The spec command creates a new specification file (config.json) for a new OCI bundle.
+The specification file is a starter file that runs the command specified by
+'args' in the container. If 'args' is not specified the default is to run the
+'sh' program.
 
-The specification file is a starter file that runs the "sh" command in the container. You
-should edit the file to suit your needs. You can find out more about the format of the
-specification file by visiting the OCI runtime spec repository:
+While a number of flags are provided to change values in the specification, you
+can examine the file and edit it to suit your needs after this command runs.
+You can find out more about the format of the specification file by visiting
+the OCI runtime spec repository:
 https://github.com/opencontainers/runtime-spec/
 
 EXAMPLE:
     $ mkdir -p bundle/rootfs
     $ cd bundle
-    $ runsc spec
+    $ runsc spec -- /hello
     $ docker export $(docker create hello-world) | tar -xf - -C rootfs
-    $ sed -i 's;"sh";"/hello";' config.json
     $ sudo runsc run hello
 
 `
@@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
 	f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
 		"this value MUST be an absolute path")
+	f.StringVar(&s.netns, "netns", "", "network namespace path")
 }
 
 // Execute implements subcommands.Command.Execute.
 func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Grab the arguments.
+	containerArgs := f.Args()
+	if len(containerArgs) == 0 {
+		containerArgs = []string{"sh"}
+	}
+
 	confPath := filepath.Join(s.bundle, "config.json")
 	if _, err := os.Stat(confPath); !os.IsNotExist(err) {
 		Fatalf("file %q already exists", confPath)
 	}
 
-	var spec = genSpec(s.cwd)
+	configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664)
+	if err != nil {
+		Fatalf("opening file %q: %v", confPath, err)
+	}
 
-	if err := ioutil.WriteFile(confPath, spec, 0664); err != nil {
+	err = writeSpec(configFile, s.cwd, s.netns, containerArgs)
+	if err != nil {
 		Fatalf("writing to %q: %v", confPath, err)
 	}
 
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index a27a01942..e189648f4 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -100,19 +100,20 @@ type execDesc struct {
 	c    *Container
 	cmd  []string
 	want int
-	desc string
+	name string
 }
 
-func execMany(execs []execDesc) error {
+func execMany(t *testing.T, execs []execDesc) {
 	for _, exec := range execs {
-		args := &control.ExecArgs{Argv: exec.cmd}
-		if ws, err := exec.c.executeSync(args); err != nil {
-			return fmt.Errorf("error executing %+v: %v", args, err)
-		} else if ws.ExitStatus() != exec.want {
-			return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
-		}
+		t.Run(exec.name, func(t *testing.T) {
+			args := &control.ExecArgs{Argv: exec.cmd}
+			if ws, err := exec.c.executeSync(args); err != nil {
+				t.Errorf("error executing %+v: %v", args, err)
+			} else if ws.ExitStatus() != exec.want {
+				t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want)
+			}
+		})
 	}
-	return nil
 }
 
 func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
@@ -1072,7 +1073,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 // Test that pod shared mounts are properly mounted in 2 containers and that
 // changes from one container is reflected in the other.
 func TestMultiContainerSharedMount(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1110,84 +1111,82 @@ func TestMultiContainerSharedMount(t *testing.T) {
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-					desc: "directory is mounted in container0",
+					name: "directory is mounted in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-					desc: "directory is mounted in container1",
+					name: "directory is mounted in container1",
 				},
 				{
 					c:    containers[0],
-					cmd:  []string{"/usr/bin/touch", file0},
-					desc: "create file in container0",
+					cmd:  []string{"/bin/touch", file0},
+					name: "create file in container0",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-f", file0},
-					desc: "file appears in container0",
+					name: "file appears in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-f", file1},
-					desc: "file appears in container1",
+					name: "file appears in container1",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/bin/rm", file1},
-					desc: "file removed from container1",
+					name: "remove file from container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-					desc: "file removed from container0",
+					name: "file removed from container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-					desc: "file removed from container1",
+					name: "file removed from container1",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/bin/mkdir", file1},
-					desc: "create directory in container1",
+					name: "create directory in container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-d", file0},
-					desc: "dir appears in container0",
+					name: "dir appears in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-d", file1},
-					desc: "dir appears in container1",
+					name: "dir appears in container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/bin/rmdir", file0},
-					desc: "create directory in container0",
+					name: "remove directory from container0",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "!", "-d", file0},
-					desc: "dir removed from container0",
+					name: "dir removed from container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "!", "-d", file1},
-					desc: "dir removed from container1",
+					name: "dir removed from container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 		})
 	}
 }
 
 // Test that pod mounts are mounted as readonly when requested.
 func TestMultiContainerSharedMountReadonly(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1225,35 +1224,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-					desc: "directory is mounted in container0",
+					name: "directory is mounted in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-					desc: "directory is mounted in container1",
+					name: "directory is mounted in container1",
 				},
 				{
 					c:    containers[0],
-					cmd:  []string{"/usr/bin/touch", file0},
+					cmd:  []string{"/bin/touch", file0},
 					want: 1,
-					desc: "fails to write to container0",
+					name: "fails to write to container0",
 				},
 				{
 					c:    containers[1],
-					cmd:  []string{"/usr/bin/touch", file1},
+					cmd:  []string{"/bin/touch", file1},
 					want: 1,
-					desc: "fails to write to container1",
+					name: "fails to write to container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 		})
 	}
 }
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
+	// TODO(gvisor.dev/issue/1487): This is failing with VFS2.
 	for name, conf := range configs(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1291,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 			execs := []execDesc{
 				{
 					c:    containers[0],
-					cmd:  []string{"/usr/bin/touch", file0},
-					desc: "create file in container0",
+					cmd:  []string{"/bin/touch", file0},
+					name: "create file in container0",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-f", file0},
-					desc: "file appears in container0",
+					name: "file appears in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-f", file1},
-					desc: "file appears in container1",
+					name: "file appears in container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 
 			containers[1].Destroy()
 
@@ -1334,32 +1330,30 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-f", file0},
-					desc: "file is still in container0",
+					name: "file is still in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-f", file1},
-					desc: "file is still in container1",
+					name: "file is still in container1",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/bin/rm", file1},
-					desc: "file removed from container1",
+					name: "file removed from container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-					desc: "file removed from container0",
+					name: "file removed from container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-					desc: "file removed from container1",
+					name: "file removed from container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 		})
 	}
 }
@@ -1367,53 +1361,53 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 // Test that unsupported pod mounts options are ignored when matching master and
 // slave mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
-	rootDir, cleanup, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer cleanup()
-
-	conf := testutil.TestConfig(t)
-	conf.RootDir = rootDir
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
 
-	// Setup the containers.
-	sleep := []string{"/bin/sleep", "100"}
-	podSpec, ids := createSpecs(sleep, sleep)
-	mnt0 := specs.Mount{
-		Destination: "/mydir/test",
-		Source:      "/some/dir",
-		Type:        "tmpfs",
-		Options:     []string{"rw", "rbind", "relatime"},
-	}
-	podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+			// Setup the containers.
+			sleep := []string{"/bin/sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     []string{"rw", "rbind", "relatime"},
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-	mnt1 := mnt0
-	mnt1.Destination = "/mydir2/test2"
-	mnt1.Options = []string{"rw", "nosuid"}
-	podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			mnt1.Options = []string{"rw", "nosuid"}
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-	createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-	containers, cleanup, err := startContainers(conf, podSpec, ids)
-	if err != nil {
-		t.Fatalf("error starting containers: %v", err)
-	}
-	defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-	execs := []execDesc{
-		{
-			c:    containers[0],
-			cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-			desc: "directory is mounted in container0",
-		},
-		{
-			c:    containers[1],
-			cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-			desc: "directory is mounted in container1",
-		},
-	}
-	if err := execMany(execs); err != nil {
-		t.Fatal(err.Error())
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					name: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					name: "directory is mounted in container1",
+				},
+			}
+			execMany(t, execs)
+		})
 	}
 }
 
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 1dce36965..88814b83c 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -128,6 +128,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_MADVISE:      {},
 	unix.SYS_MEMFD_CREATE:    {}, /// Used by flipcall.PacketWindowAllocator.Init().
 	syscall.SYS_MKDIRAT:      {},
+	syscall.SYS_MKNODAT:      {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
 	//
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 74977c313..b7521bda7 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -139,7 +139,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
 		return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
 	}
 
-	stat, err := stat(f.FD())
+	stat, err := fstat(f.FD())
 	if err != nil {
 		return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
 	}
@@ -352,7 +352,7 @@ func newFDMaybe(file *fd.FD) *fd.FD {
 	return dup
 }
 
-func stat(fd int) (syscall.Stat_t, error) {
+func fstat(fd int) (syscall.Stat_t, error) {
 	var stat syscall.Stat_t
 	if err := syscall.Fstat(fd, &stat); err != nil {
 		return syscall.Stat_t{}, err
@@ -360,6 +360,14 @@ func stat(fd int) (syscall.Stat_t, error) {
 	return stat, nil
 }
 
+func stat(path string) (syscall.Stat_t, error) {
+	var stat syscall.Stat_t
+	if err := syscall.Stat(path, &stat); err != nil {
+		return syscall.Stat_t{}, err
+	}
+	return stat, nil
+}
+
 func fchown(fd int, uid p9.UID, gid p9.GID) error {
 	return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
 }
@@ -388,7 +396,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		}
 	}
 
-	stat, err := stat(newFile.FD())
+	stat, err := fstat(newFile.FD())
 	if err != nil {
 		if newFile != l.file {
 			newFile.Close()
@@ -449,7 +457,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
 	if err := fchown(child.FD(), uid, gid); err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
-	stat, err := stat(child.FD())
+	stat, err := fstat(child.FD())
 	if err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
@@ -497,7 +505,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	if err := fchown(f.FD(), uid, gid); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	stat, err := stat(f.FD())
+	stat, err := fstat(f.FD())
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
@@ -517,7 +525,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
 			return nil, nil, extractErrno(err)
 		}
 
-		stat, err := stat(newFile.FD())
+		stat, err := fstat(newFile.FD())
 		if err != nil {
 			newFile.Close()
 			return nil, nil, extractErrno(err)
@@ -542,7 +550,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
 		if err != nil {
 			return nil, nil, extractErrno(err)
 		}
-		stat, err := stat(f.FD())
+		stat, err := fstat(f.FD())
 		if err != nil {
 			f.Close()
 			return nil, nil, extractErrno(err)
@@ -592,7 +600,7 @@ func (l *localFile) FSync() error {
 
 // GetAttr implements p9.File.
 func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
-	stat, err := stat(l.file.FD())
+	stat, err := fstat(l.file.FD())
 	if err != nil {
 		return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
 	}
@@ -880,7 +888,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	if err := fchown(f.FD(), uid, gid); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	stat, err := stat(f.FD())
+	stat, err := fstat(f.FD())
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
@@ -907,13 +915,47 @@ func (l *localFile) Link(target p9.File, newName string) error {
 }
 
 // Mknod implements p9.File.
 //
-// Not implemented.
-func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+// Only regular files can be created: any other file type is rejected with
+// EPERM, and the two device-number arguments are ignored. On a read-only
+// mount it returns EROFS, or panics when conf.PanicOnWrite is set. It returns
+// EEXIST when the target path can already be stat'ed.
+//
+// NOTE(review): uid and gid are accepted but never applied to the new file;
+// confirm whether it should be chowned the way Create chowns its child.
+func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	conf := l.attachPoint.conf
+	if conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return p9.QID{}, syscall.EROFS
+	}
+
+	hostPath := path.Join(l.hostPath, name)
+
+	// Return EEXIST if the file already exists.
+	if _, err := stat(hostPath); err == nil {
+		return p9.QID{}, syscall.EEXIST
+	}
+
 	// From mknod(2) man page:
 	// "EPERM: [...] if the filesystem containing pathname does not support
 	// the type of node requested."
-	return p9.QID{}, syscall.EPERM
+	if mode.FileType() != p9.ModeRegular {
+		return p9.QID{}, syscall.EPERM
+	}
+
+	// Allow Mknod to create regular files.
+	if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	stat, err := stat(hostPath)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	return l.attachPoint.makeQID(stat), nil
 }
 
 // UnlinkAt implements p9.File.
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index deee619f3..817a923ad 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -134,7 +134,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		return err
 	}
 	if isRoot {
-
 		return fmt.Errorf("cannot run with network enabled in root network namespace")
 	}
 
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
index 44cce0e3b..29a84f184 100644
--- a/test/e2e/BUILD
+++ b/test/e2e/BUILD
@@ -23,6 +23,7 @@ go_test(
         "//pkg/test/dockerutil",
         "//pkg/test/testutil",
         "//runsc/specutils",
+        "@com_github_docker_docker//api/types/mount:go_default_library",
     ],
 )
 
diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
index 6a63b1232..b47df447c 100644
--- a/test/e2e/exec_test.go
+++ b/test/e2e/exec_test.go
@@ -22,12 +22,10 @@
 package integration
 
 import (
+	"context"
 	"fmt"
-	"os"
-	"os/exec"
 	"strconv"
 	"strings"
-	"syscall"
 	"testing"
 	"time"
 
@@ -39,18 +37,19 @@ import (
 
 // Test that exec uses the exact same capability set as the container.
 func TestExecCapabilities(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Check that capability.
-	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
+	matches, err := d.WaitForOutputSubmatch(ctx, "CapEff:\t([0-9a-f]+)\n", 5*time.Second)
 	if err != nil {
 		t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
 	}
@@ -61,7 +60,7 @@ func TestExecCapabilities(t *testing.T) {
 	t.Log("Root capabilities:", want)
 
 	// Now check that exec'd process capabilities match the root.
-	got, err := d.Exec(dockerutil.RunOpts{}, "grep", "CapEff:", "/proc/self/status")
+	got, err := d.Exec(ctx, dockerutil.ExecOpts{}, "grep", "CapEff:", "/proc/self/status")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -74,11 +73,12 @@ func TestExecCapabilities(t *testing.T) {
 // Test that 'exec --privileged' adds all capabilities, except for CAP_NET_RAW
 // which is removed from the container when --net-raw=false.
 func TestExecPrivileged(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container with all capabilities dropped.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image:   "basic/alpine",
 		CapDrop: []string{"all"},
 	}, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
@@ -86,7 +86,7 @@ func TestExecPrivileged(t *testing.T) {
 	}
 
 	// Check that all capabilities where dropped from container.
-	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
+	matches, err := d.WaitForOutputSubmatch(ctx, "CapEff:\t([0-9a-f]+)\n", 5*time.Second)
 	if err != nil {
 		t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
 	}
@@ -104,7 +104,7 @@ func TestExecPrivileged(t *testing.T) {
 
 	// Check that 'exec --privileged' adds all capabilities, except for
 	// CAP_NET_RAW.
-	got, err := d.Exec(dockerutil.RunOpts{
+	got, err := d.Exec(ctx, dockerutil.ExecOpts{
 		Privileged: true,
 	}, "grep", "CapEff:", "/proc/self/status")
 	if err != nil {
@@ -118,76 +118,59 @@ func TestExecPrivileged(t *testing.T) {
 }
 
 func TestExecJobControl(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	// Exec 'sh' with an attached pty.
-	if _, err := d.Exec(dockerutil.RunOpts{
-		Pty: func(cmd *exec.Cmd, ptmx *os.File) {
-			// Call "sleep 100 | cat" in the shell. We pipe to cat
-			// so that there will be two processes in the
-			// foreground process group.
-			if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
-				t.Fatalf("error writing to pty: %v", err)
-			}
-
-			// Give shell a few seconds to start executing the sleep.
-			time.Sleep(2 * time.Second)
-
-			// Send a ^C to the pty, which should kill sleep and
-			// cat, but not the shell.  \x03 is ASCII "end of
-			// text", which is the same as ^C.
-			if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-				t.Fatalf("error writing to pty: %v", err)
-			}
-
-			// The shell should still be alive at this point. Sleep
-			// should have exited with code 2+128=130. We'll exit
-			// with 10 plus that number, so that we can be sure
-			// that the shell did not get signalled.
-			if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-				t.Fatalf("error writing to pty: %v", err)
-			}
-
-			// Exec process should exit with code 10+130=140.
-			ps, err := cmd.Process.Wait()
-			if err != nil {
-				t.Fatalf("error waiting for exec process: %v", err)
-			}
-			ws := ps.Sys().(syscall.WaitStatus)
-			if !ws.Exited() {
-				t.Errorf("ws.Exited got false, want true")
-			}
-			if got, want := ws.ExitStatus(), 140; got != want {
-				t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
-			}
-		},
-	}, "sh"); err != nil {
+	p, err := d.ExecProcess(ctx, dockerutil.ExecOpts{UseTTY: true}, "/bin/sh")
+	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
+	// Run "sleep 100 | cat" so the foreground process group has two processes.
+	if _, err = p.Write(time.Second, []byte("sleep 100 | cat\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+	time.Sleep(time.Second)
+	// Send ^C (0x03), which should kill sleep and cat but not the shell.
+	if _, err = p.Write(time.Second, []byte{0x03}); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+	// Exit with $?+10 so a surviving shell is distinguishable from a signalled one.
+	if _, err = p.Write(time.Second, []byte("exit $(expr $? + 10)\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+	// sleep exits with 2+128=130, so the shell should exit with 130+10=140.
+	want := 140
+	got, err := p.WaitExitStatus(ctx)
+	if err != nil {
+		t.Fatalf("wait for exit failed with: %v", err)
+	} else if got != want {
+		t.Fatalf("wait for exit returned: %d want: %d", got, want)
+	}
 }
 
 // Test that failure to exec returns proper error message.
 func TestExecError(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Attempt to exec a binary that doesn't exist.
-	out, err := d.Exec(dockerutil.RunOpts{}, "no_can_find")
+	out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "no_can_find")
 	if err == nil {
 		t.Fatalf("docker exec didn't fail")
 	}
@@ -198,11 +181,12 @@ func TestExecError(t *testing.T) {
 
 // Test that exec inherits environment from run.
 func TestExecEnv(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container with env FOO=BAR.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 		Env:   []string{"FOO=BAR"},
 	}, "sleep", "1000"); err != nil {
@@ -210,7 +194,7 @@ func TestExecEnv(t *testing.T) {
 	}
 
 	// Exec "echo $FOO".
-	got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $FOO")
+	got, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "echo $FOO")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -222,11 +206,12 @@ func TestExecEnv(t *testing.T) {
 // TestRunEnvHasHome tests that run always has HOME environment set.
 func TestRunEnvHasHome(t *testing.T) {
 	// Base alpine image does not have any environment variables set.
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Exec "echo $HOME". The 'bin' user's home dir is '/bin'.
-	got, err := d.Run(dockerutil.RunOpts{
+	got, err := d.Run(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 		User:  "bin",
 	}, "/bin/sh", "-c", "echo $HOME")
@@ -243,17 +228,18 @@ func TestRunEnvHasHome(t *testing.T) {
 // Test that exec always has HOME environment set, even when not set in run.
 func TestExecEnvHasHome(t *testing.T) {
 	// Base alpine image does not have any environment variables set.
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Exec "echo $HOME", and expect to see "/root".
-	got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $HOME")
+	got, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -265,12 +251,12 @@ func TestExecEnvHasHome(t *testing.T) {
 	newUID := 1234
 	newHome := "/foo/bar"
 	cmd := fmt.Sprintf("mkdir -p -m 777 %q && adduser foo -D -u %d -h %q", newHome, newUID, newHome)
-	if _, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", cmd); err != nil {
+	if _, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", cmd); err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
 
 	// Execute the same as the new user and expect newHome.
-	got, err = d.Exec(dockerutil.RunOpts{
+	got, err = d.Exec(ctx, dockerutil.ExecOpts{
 		User: strconv.Itoa(newUID),
 	}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index 60e739c6a..5a9455b33 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -22,20 +22,20 @@
 package integration
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"net"
 	"net/http"
 	"os"
-	"os/exec"
 	"path/filepath"
 	"strconv"
 	"strings"
-	"syscall"
 	"testing"
 	"time"
 
+	"github.com/docker/docker/api/types/mount"
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 )
@@ -56,22 +56,23 @@ func httpRequestSucceeds(client http.Client, server string, port int) error {
 
 // TestLifeCycle tests a basic Create/Start/Stop docker container life cycle.
 func TestLifeCycle(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
-	if err := d.Create(dockerutil.RunOpts{
+	if err := d.Create(ctx, dockerutil.RunOpts{
 		Image: "basic/nginx",
 		Ports: []int{80},
 	}); err != nil {
 		t.Fatalf("docker create failed: %v", err)
 	}
-	if err := d.Start(); err != nil {
+	if err := d.Start(ctx); err != nil {
 		t.Fatalf("docker start failed: %v", err)
 	}
 
 	// Test that container is working.
-	port, err := d.FindPort(80)
+	port, err := d.FindPort(ctx, 80)
 	if err != nil {
 		t.Fatalf("docker.FindPort(80) failed: %v", err)
 	}
@@ -83,10 +84,10 @@ func TestLifeCycle(t *testing.T) {
 		t.Errorf("http request failed: %v", err)
 	}
 
-	if err := d.Stop(); err != nil {
+	if err := d.Stop(ctx); err != nil {
 		t.Fatalf("docker stop failed: %v", err)
 	}
-	if err := d.Remove(); err != nil {
+	if err := d.Remove(ctx); err != nil {
 		t.Fatalf("docker rm failed: %v", err)
 	}
 }
@@ -96,11 +97,12 @@ func TestPauseResume(t *testing.T) {
 		t.Skip("Checkpoint is not supported.")
 	}
 
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/python",
 		Ports: []int{8080}, // See Dockerfile.
 	}); err != nil {
@@ -108,7 +110,7 @@ func TestPauseResume(t *testing.T) {
 	}
 
 	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
+	port, err := d.FindPort(ctx, 8080)
 	if err != nil {
 		t.Fatalf("docker.FindPort(8080) failed: %v", err)
 	}
@@ -124,7 +126,7 @@ func TestPauseResume(t *testing.T) {
 		t.Error("http request failed:", err)
 	}
 
-	if err := d.Pause(); err != nil {
+	if err := d.Pause(ctx); err != nil {
 		t.Fatalf("docker pause failed: %v", err)
 	}
 
@@ -140,7 +142,7 @@ func TestPauseResume(t *testing.T) {
 		t.Errorf("http req got unexpected error %v", v)
 	}
 
-	if err := d.Unpause(); err != nil {
+	if err := d.Unpause(ctx); err != nil {
 		t.Fatalf("docker unpause failed: %v", err)
 	}
 
@@ -160,11 +162,12 @@ func TestCheckpointRestore(t *testing.T) {
 		t.Skip("Pause/resume is not supported.")
 	}
 
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/python",
 		Ports: []int{8080}, // See Dockerfile.
 	}); err != nil {
@@ -172,20 +175,20 @@ func TestCheckpointRestore(t *testing.T) {
 	}
 
 	// Create a snapshot.
-	if err := d.Checkpoint("test"); err != nil {
+	if err := d.Checkpoint(ctx, "test"); err != nil {
 		t.Fatalf("docker checkpoint failed: %v", err)
 	}
-	if _, err := d.Wait(30 * time.Second); err != nil {
+	if err := d.WaitTimeout(ctx, 30*time.Second); err != nil {
 		t.Fatalf("wait failed: %v", err)
 	}
 
 	// TODO(b/143498576): Remove Poll after github.com/moby/moby/issues/38963 is fixed.
-	if err := testutil.Poll(func() error { return d.Restore("test") }, 15*time.Second); err != nil {
+	if err := testutil.Poll(func() error { return d.Restore(ctx, "test") }, 15*time.Second); err != nil {
 		t.Fatalf("docker restore failed: %v", err)
 	}
 
 	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
+	port, err := d.FindPort(ctx, 8080)
 	if err != nil {
 		t.Fatalf("docker.FindPort(8080) failed: %v", err)
 	}
@@ -204,26 +207,27 @@ func TestCheckpointRestore(t *testing.T) {
 
 // Create client and server that talk to each other using the local IP.
 func TestConnectToSelf(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Creates server that replies "server" and exists. Sleeps at the end because
 	// 'docker exec' gets killed if the init process exists before it can finish.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/ubuntu",
 	}, "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Finds IP address for host.
-	ip, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
+	ip, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
 	ip = strings.TrimRight(ip, "\n")
 
 	// Runs client that sends "client" to the server and exits.
-	reply, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
+	reply, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -232,21 +236,22 @@ func TestConnectToSelf(t *testing.T) {
 	if want := "server\n"; reply != want {
 		t.Errorf("Error on server, want: %q, got: %q", want, reply)
 	}
-	if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil {
+	if _, err := d.WaitForOutput(ctx, "^client\n$", 1*time.Second); err != nil {
 		t.Fatalf("docker.WaitForOutput(client) timeout: %v", err)
 	}
 }
 
 func TestMemLimit(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// N.B. Because the size of the memory file may grow in large chunks,
 	// there is a minimum threshold of 1GB for the MemTotal figure.
-	allocMemory := 1024 * 1024
-	out, err := d.Run(dockerutil.RunOpts{
+	allocMemory := 1024 * 1024 // In kb.
+	out, err := d.Run(ctx, dockerutil.RunOpts{
 		Image:  "basic/alpine",
-		Memory: allocMemory, // In kB.
+		Memory: allocMemory * 1024, // In bytes.
 	}, "sh", "-c", "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'")
 	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
@@ -272,13 +277,14 @@ func TestMemLimit(t *testing.T) {
 }
 
 func TestNumCPU(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Read how many cores are in the container.
-	out, err := d.Run(dockerutil.RunOpts{
-		Image: "basic/alpine",
-		Extra: []string{"--cpuset-cpus=0"},
+	out, err := d.Run(ctx, dockerutil.RunOpts{
+		Image:      "basic/alpine",
+		CpusetCpus: "0",
 	}, "sh", "-c", "cat /proc/cpuinfo | grep 'processor.*:' | wc -l")
 	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
@@ -296,48 +302,34 @@ func TestNumCPU(t *testing.T) {
 
 // TestJobControl tests that job control characters are handled properly.
 func TestJobControl(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container with an attached PTY.
-	if _, err := d.Run(dockerutil.RunOpts{
+	p, err := d.SpawnProcess(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
-		Pty: func(_ *exec.Cmd, ptmx *os.File) {
-			// Call "sleep 100" in the shell.
-			if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
-				t.Fatalf("error writing to pty: %v", err)
-			}
-
-			// Give shell a few seconds to start executing the sleep.
-			time.Sleep(2 * time.Second)
+	}, "sh", "-c", "sleep 100 | cat")
+	if err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	// Give shell a few seconds to start executing the sleep.
+	time.Sleep(2 * time.Second)
 
-			// Send a ^C to the pty, which should kill sleep, but
-			// not the shell.  \x03 is ASCII "end of text", which
-			// is the same as ^C.
-			if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-				t.Fatalf("error writing to pty: %v", err)
-			}
+	if _, err := p.Write(time.Second, []byte{0x03}); err != nil { // 0x03 is ^C.
+		t.Fatalf("error writing to pty: %v", err)
+	}
 
-			// The shell should still be alive at this point. Sleep
-			// should have exited with code 2+128=130. We'll exit
-			// with 10 plus that number, so that we can be sure
-			// that the shell did not get signalled.
-			if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-				t.Fatalf("error writing to pty: %v", err)
-			}
-		},
-	}, "sh"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
+	if err := d.WaitTimeout(ctx, 3*time.Second); err != nil {
+		t.Fatalf("WaitTimeout failed: %v", err)
 	}
 
-	// Wait for the container to exit.
-	got, err := d.Wait(5 * time.Second)
+	want := 130 // 2+128: the exit code of a process interrupted by ^C.
+	got, err := p.WaitExitStatus(ctx)
 	if err != nil {
-		t.Fatalf("error getting exit code: %v", err)
-	}
-	// Container should exit with code 10+130=140.
-	if want := syscall.WaitStatus(140); got != want {
-		t.Errorf("container exited with code %d want %d", got, want)
+		t.Fatalf("wait for exit failed with: %v", err)
+	} else if got != want {
+		t.Fatalf("got: %d want: %d", got, want)
 	}
 }
 
@@ -356,15 +348,16 @@ func TestWorkingDirCreation(t *testing.T) {
 				name += "-readonly"
 			}
 			t.Run(name, func(t *testing.T) {
-				d := dockerutil.MakeDocker(t)
-				defer d.CleanUp()
+				ctx := context.Background()
+				d := dockerutil.MakeContainer(ctx, t)
+				defer d.CleanUp(ctx)
 
 				opts := dockerutil.RunOpts{
 					Image:    "basic/alpine",
 					WorkDir:  tc.workingDir,
 					ReadOnly: readonly,
 				}
-				got, err := d.Run(opts, "sh", "-c", "echo ${PWD}")
+				got, err := d.Run(ctx, opts, "sh", "-c", "echo ${PWD}")
 				if err != nil {
 					t.Fatalf("docker run failed: %v", err)
 				}
@@ -378,11 +371,12 @@ func TestWorkingDirCreation(t *testing.T) {
 
 // TestTmpFile checks that files inside '/tmp' are not overridden.
 func TestTmpFile(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	opts := dockerutil.RunOpts{Image: "tmpfile"}
-	got, err := d.Run(opts, "cat", "/tmp/foo/file.txt")
+	got, err := d.Run(ctx, opts, "cat", "/tmp/foo/file.txt")
 	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
@@ -393,6 +387,7 @@ func TestTmpFile(t *testing.T) {
 
 // TestTmpMount checks that mounts inside '/tmp' are not overridden.
 func TestTmpMount(t *testing.T) {
+	ctx := context.Background()
 	dir, err := ioutil.TempDir(testutil.TmpDir(), "tmp-mount")
 	if err != nil {
 		t.Fatalf("TempDir(): %v", err)
@@ -401,19 +396,20 @@ func TestTmpMount(t *testing.T) {
 	if err := ioutil.WriteFile(filepath.Join(dir, "file.txt"), []byte("123"), 0666); err != nil {
 		t.Fatalf("WriteFile(): %v", err)
 	}
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	opts := dockerutil.RunOpts{
 		Image: "basic/alpine",
-		Mounts: []dockerutil.Mount{
+		Mounts: []mount.Mount{
 			{
+				Type:   mount.TypeBind,
 				Source: dir,
 				Target: "/tmp/foo",
 			},
 		},
 	}
-	got, err := d.Run(opts, "cat", "/tmp/foo/file.txt")
+	got, err := d.Run(ctx, opts, "cat", "/tmp/foo/file.txt")
 	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
@@ -426,10 +422,11 @@ func TestTmpMount(t *testing.T) {
 // runsc to hide the incoherence of FDs opened before and after overlayfs
 // copy-up on the host.
 func TestHostOverlayfsCopyUp(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
-	if _, err := d.Run(dockerutil.RunOpts{
+	if _, err := d.Run(ctx, dockerutil.RunOpts{
 		Image:   "hostoverlaytest",
 		WorkDir: "/root",
 	}, "./test"); err != nil {
diff --git a/test/e2e/regression_test.go b/test/e2e/regression_test.go
index 327a2174c..70bbe5121 100644
--- a/test/e2e/regression_test.go
+++ b/test/e2e/regression_test.go
@@ -15,6 +15,7 @@
 package integration
 
 import (
+	"context"
 	"strings"
 	"testing"
 
@@ -27,11 +28,12 @@ import (
 // Prerequisite: the directory where the socket file is created must not have
 // been open for write before bind(2) is called.
 func TestBindOverlay(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Run the container.
-	got, err := d.Run(dockerutil.RunOpts{
+	got, err := d.Run(ctx, dockerutil.RunOpts{
 		Image: "basic/ubuntu",
 	}, "bash", "-c", "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p")
 	if err != nil {
diff --git a/test/image/image_test.go b/test/image/image_test.go
index 3e4321480..8aa78035f 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -22,6 +22,7 @@
 package image
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"io/ioutil"
@@ -37,11 +38,12 @@ import (
 )
 
 func TestHelloWorld(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Run the basic container.
-	out, err := d.Run(dockerutil.RunOpts{
+	out, err := d.Run(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "echo", "Hello world!")
 	if err != nil {
@@ -107,8 +109,9 @@ func testHTTPServer(t *testing.T, port int) {
 }
 
 func TestHttpd(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
 	opts := dockerutil.RunOpts{
@@ -116,12 +119,12 @@ func TestHttpd(t *testing.T) {
 		Ports: []int{80},
 	}
 	d.CopyFiles(&opts, "/usr/local/apache2/htdocs", "test/image/latin10k.txt")
-	if err := d.Spawn(opts); err != nil {
+	if err := d.Spawn(ctx, opts); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Find where port 80 is mapped to.
-	port, err := d.FindPort(80)
+	port, err := d.FindPort(ctx, 80)
 	if err != nil {
 		t.Fatalf("FindPort(80) failed: %v", err)
 	}
@@ -135,8 +138,9 @@ func TestHttpd(t *testing.T) {
 }
 
 func TestNginx(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the container.
 	opts := dockerutil.RunOpts{
@@ -144,12 +148,12 @@ func TestNginx(t *testing.T) {
 		Ports: []int{80},
 	}
 	d.CopyFiles(&opts, "/usr/share/nginx/html", "test/image/latin10k.txt")
-	if err := d.Spawn(opts); err != nil {
+	if err := d.Spawn(ctx, opts); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Find where port 80 is mapped to.
-	port, err := d.FindPort(80)
+	port, err := d.FindPort(ctx, 80)
 	if err != nil {
 		t.Fatalf("FindPort(80) failed: %v", err)
 	}
@@ -163,11 +167,12 @@ func TestNginx(t *testing.T) {
 }
 
 func TestMysql(t *testing.T) {
-	server := dockerutil.MakeDocker(t)
-	defer server.CleanUp()
+	ctx := context.Background()
+	server := dockerutil.MakeContainer(ctx, t)
+	defer server.CleanUp(ctx)
 
 	// Start the container.
-	if err := server.Spawn(dockerutil.RunOpts{
+	if err := server.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/mysql",
 		Env:   []string{"MYSQL_ROOT_PASSWORD=foobar123"},
 	}); err != nil {
@@ -175,42 +180,38 @@ func TestMysql(t *testing.T) {
 	}
 
 	// Wait until it's up and running.
-	if _, err := server.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
+	if _, err := server.WaitForOutput(ctx, "port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
 		t.Fatalf("WaitForOutput() timeout: %v", err)
 	}
 
 	// Generate the client and copy in the SQL payload.
-	client := dockerutil.MakeDocker(t)
-	defer client.CleanUp()
+	client := dockerutil.MakeContainer(ctx, t)
+	defer client.CleanUp(ctx)
 
 	// Tell mysql client to connect to the server and execute the file in
 	// verbose mode to verify the output.
 	opts := dockerutil.RunOpts{
 		Image: "basic/mysql",
-		Links: []dockerutil.Link{
-			{
-				Source: server,
-				Target: "mysql",
-			},
-		},
+		Links: []string{server.MakeLink("mysql")},
 	}
 	client.CopyFiles(&opts, "/sql", "test/image/mysql.sql")
-	if _, err := client.Run(opts, "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql"); err != nil {
+	if _, err := client.Run(ctx, opts, "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Ensure file executed to the end and shutdown mysql.
-	if _, err := server.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
+	if _, err := server.WaitForOutput(ctx, "mysqld: Shutdown complete", 30*time.Second); err != nil {
 		t.Fatalf("WaitForOutput() timeout: %v", err)
 	}
 }
 
 func TestTomcat(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start the server.
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/tomcat",
 		Ports: []int{8080},
 	}); err != nil {
@@ -218,7 +219,7 @@ func TestTomcat(t *testing.T) {
 	}
 
 	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
+	port, err := d.FindPort(ctx, 8080)
 	if err != nil {
 		t.Fatalf("FindPort(8080) failed: %v", err)
 	}
@@ -240,8 +241,9 @@ func TestTomcat(t *testing.T) {
 }
 
 func TestRuby(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Execute the ruby workload.
 	opts := dockerutil.RunOpts{
@@ -249,12 +251,12 @@ func TestRuby(t *testing.T) {
 		Ports: []int{8080},
 	}
 	d.CopyFiles(&opts, "/src", "test/image/ruby.rb", "test/image/ruby.sh")
-	if err := d.Spawn(opts, "/src/ruby.sh"); err != nil {
+	if err := d.Spawn(ctx, opts, "/src/ruby.sh"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
+	port, err := d.FindPort(ctx, 8080)
 	if err != nil {
 		t.Fatalf("FindPort(8080) failed: %v", err)
 	}
@@ -283,20 +285,21 @@ func TestRuby(t *testing.T) {
 }
 
 func TestStdio(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	wantStdout := "hello stdout"
 	wantStderr := "bonjour stderr"
 	cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr)
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "/bin/sh", "-c", cmd); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	for _, want := range []string{wantStdout, wantStderr} {
-		if _, err := d.WaitForOutput(want, 5*time.Second); err != nil {
+		if _, err := d.WaitForOutput(ctx, want, 5*time.Second); err != nil {
 			t.Fatalf("docker didn't get output %q : %v", want, err)
 		}
 	}
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 872021358..068f228bd 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -618,7 +618,7 @@ func (FilterInputDestination) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputDestination) ContainerAction(ip net.IP) error {
-	addrs, err := localAddrs()
+	addrs, err := localAddrs(false)
 	if err != nil {
 		return err
 	}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 340f9426e..f5ac79370 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -15,8 +15,10 @@
 package iptables
 
 import (
+	"context"
 	"fmt"
 	"net"
+	"reflect"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
@@ -37,8 +39,9 @@ func singleTest(t *testing.T, test TestCase) {
 		t.Fatalf("no test found with name %q. Has it been registered?", test.Name())
 	}
 
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Create and start the container.
 	opts := dockerutil.RunOpts{
@@ -46,12 +49,12 @@ func singleTest(t *testing.T, test TestCase) {
 		CapAdd: []string{"NET_ADMIN"},
 	}
 	d.CopyFiles(&opts, "/runner", "test/iptables/runner/runner")
-	if err := d.Spawn(opts, "/runner/runner", "-name", test.Name()); err != nil {
+	if err := d.Spawn(ctx, opts, "/runner/runner", "-name", test.Name()); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Get the container IP.
-	ip, err := d.FindIP()
+	ip, err := d.FindIP(ctx)
 	if err != nil {
 		t.Fatalf("failed to get container IP: %v", err)
 	}
@@ -69,7 +72,7 @@ func singleTest(t *testing.T, test TestCase) {
 	// Wait for the final statement. This structure has the side effect
 	// that all container logs will appear within the individual test
 	// context.
-	if _, err := d.WaitForOutput(TerminalStatement, TestTimeout); err != nil {
+	if _, err := d.WaitForOutput(ctx, TerminalStatement, TestTimeout); err != nil {
 		t.Fatalf("test failed: %v", err)
 	}
 }
@@ -315,3 +318,24 @@ func TestInputSource(t *testing.T) {
 func TestInputInvertSource(t *testing.T) {
 	singleTest(t, FilterInputInvertSource{})
 }
+
+// TestFilterAddrs checks that filterAddrs keeps only the addresses of the
+// requested IP family and strips any prefix length from them.
+func TestFilterAddrs(t *testing.T) {
+	// Both cases filter the same mixed list, once per address family.
+	mixed := []string{"192.168.0.1", "192.168.0.2/24", "::1", "::2/128"}
+	for _, test := range []struct {
+		ipv6  bool
+		addrs []string
+		want  []string
+	}{
+		{ipv6: false, addrs: mixed, want: []string{"192.168.0.1", "192.168.0.2"}},
+		{ipv6: true, addrs: mixed, want: []string{"::1", "::2"}},
+	} {
+		got := filterAddrs(test.addrs, test.ipv6)
+		if reflect.DeepEqual(got, test.want) {
+			continue
+		}
+		t.Errorf("%v with IPv6 %t: got %v, but wanted %v", test.addrs, test.ipv6, got, test.want)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 7146edbb9..d4bc55b24 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"net"
 	"os/exec"
+	"strings"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/test/testutil"
@@ -157,8 +158,10 @@ func connectTCP(ip net.IP, port int, timeout time.Duration) error {
 	return nil
 }
 
-// localAddrs returns a list of local network interface addresses.
-func localAddrs() ([]string, error) {
+// localAddrs returns a list of local network interface addresses. When ipv6 is
+// true, only IPv6 addresses are returned. Otherwise only IPv4 addresses are
+// returned.
+func localAddrs(ipv6 bool) ([]string, error) {
 	addrs, err := net.InterfaceAddrs()
 	if err != nil {
 		return nil, err
@@ -167,7 +170,30 @@ func localAddrs() ([]string, error) {
 	for _, addr := range addrs {
 		addrStrs = append(addrStrs, addr.String())
 	}
-	return addrStrs, nil
+	return filterAddrs(addrStrs, ipv6), nil
+}
+
+// filterAddrs returns the entries of addrs that belong to the requested IP
+// family: IPv6 when ipv6 is true, IPv4 otherwise. Entries may carry a
+// "/prefix-length" suffix, which is stripped from the returned strings.
+// Entries that do not parse as an IP address are dropped.
+func filterAddrs(addrs []string, ipv6 bool) []string {
+	addrStrs := make([]string, 0, len(addrs))
+	for _, addr := range addrs {
+		// Keep only the address part of CIDR notation such as "10.0.0.1/8".
+		host := strings.Split(addr, "/")[0]
+		ip := net.ParseIP(host)
+		if ip == nil {
+			// To4() is also nil for an unparsable address, so without this
+			// check garbage input would be reported as IPv6.
+			continue
+		}
+		// Add only IPv4 or only IPv6 addresses.
+		if isIPv6 := ip.To4() == nil; isIPv6 == ipv6 {
+			addrStrs = append(addrStrs, host)
+		}
+	}
+	return addrStrs
 }
 
 // getInterfaceName returns the name of the interface other than loopback.
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index 5e54a3963..8562b0820 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -241,7 +241,7 @@ func (NATPreRedirectIP) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (NATPreRedirectIP) ContainerAction(ip net.IP) error {
-	addrs, err := localAddrs()
+	addrs, err := localAddrs(false)
 	if err != nil {
 		return err
 	}
diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD
index 0b68a760a..bad4f0183 100644
--- a/test/packetimpact/runner/BUILD
+++ b/test/packetimpact/runner/BUILD
@@ -16,5 +16,6 @@ go_test(
     deps = [
         "//pkg/test/dockerutil",
         "//test/packetimpact/netdevs",
+        "@com_github_docker_docker//api/types/mount:go_default_library",
     ],
 )
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index ea66b9756..77cdfea12 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -20,12 +20,12 @@ def _packetimpact_test_impl(ctx):
     ])
     ctx.actions.write(bench, bench_content, is_executable = True)
 
-    transitive_files = depset()
+    transitive_files = []
     if hasattr(ctx.attr._test_runner, "data_runfiles"):
-        transitive_files = depset(ctx.attr._test_runner.data_runfiles.files)
+        transitive_files.append(ctx.attr._test_runner.data_runfiles.files)
     runfiles = ctx.runfiles(
         files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server_binary,
-        transitive_files = transitive_files,
+        transitive_files = depset(transitive = transitive_files),
         collect_default = True,
         collect_data = True,
     )
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
index c0a2620de..397ca3ba5 100644
--- a/test/packetimpact/runner/packetimpact_test.go
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -16,6 +16,7 @@
 package packetimpact_test
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"io/ioutil"
@@ -29,6 +30,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/docker/docker/api/types/mount"
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/test/packetimpact/netdevs"
 )
@@ -94,15 +96,16 @@ func TestOne(t *testing.T) {
 		}
 	}
 	dockerutil.EnsureSupportedDockerVersion()
+	ctx := context.Background()
 
 	// Create the networks needed for the test. One control network is needed for
 	// the gRPC control packets and one test network on which to transmit the test
 	// packets.
-	ctrlNet := dockerutil.NewDockerNetwork(logger("ctrlNet"))
-	testNet := dockerutil.NewDockerNetwork(logger("testNet"))
-	for _, dn := range []*dockerutil.DockerNetwork{ctrlNet, testNet} {
+	ctrlNet := dockerutil.NewNetwork(ctx, logger("ctrlNet"))
+	testNet := dockerutil.NewNetwork(ctx, logger("testNet"))
+	for _, dn := range []*dockerutil.Network{ctrlNet, testNet} {
 		for {
-			if err := createDockerNetwork(dn); err != nil {
+			if err := createDockerNetwork(ctx, dn); err != nil {
 				t.Log("creating docker network:", err)
 				const wait = 100 * time.Millisecond
 				t.Logf("sleeping %s and will try creating docker network again", wait)
@@ -113,11 +116,19 @@ func TestOne(t *testing.T) {
 			}
 			break
 		}
-		defer func(dn *dockerutil.DockerNetwork) {
-			if err := dn.Cleanup(); err != nil {
+		defer func(dn *dockerutil.Network) {
+			if err := dn.Cleanup(ctx); err != nil {
 				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
 			}
 		}(dn)
+		// Sanity check.
+		inspect, err := dn.Inspect(ctx)
+		if err != nil {
+			t.Fatalf("failed to inspect network %s: %v", dn.Name, err)
+		} else if inspect.Name != dn.Name {
+			t.Fatalf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name)
+		}
+
 	}
 
 	tmpDir, err := ioutil.TempDir("", "container-output")
@@ -128,42 +139,51 @@ func TestOne(t *testing.T) {
 
 	const testOutputDir = "/tmp/testoutput"
 
-	runOpts := dockerutil.RunOpts{
-		Image:      "packetimpact",
-		CapAdd:     []string{"NET_ADMIN"},
-		Extra:      []string{"--sysctl", "net.ipv6.conf.all.disable_ipv6=0", "--rm", "-v", tmpDir + ":" + testOutputDir},
-		Foreground: true,
-	}
-
 	// Create the Docker container for the DUT.
-	dut := dockerutil.MakeDocker(logger("dut"))
+	dut := dockerutil.MakeContainer(ctx, logger("dut"))
 	if *dutPlatform == "linux" {
 		dut.Runtime = ""
 	}
 
+	runOpts := dockerutil.RunOpts{
+		Image:  "packetimpact",
+		CapAdd: []string{"NET_ADMIN"},
+		Mounts: []mount.Mount{mount.Mount{
+			Type:     mount.TypeBind,
+			Source:   tmpDir,
+			Target:   testOutputDir,
+			ReadOnly: false,
+		}},
+	}
+
 	const containerPosixServerBinary = "/packetimpact/posix_server"
 	dut.CopyFiles(&runOpts, "/packetimpact", "/test/packetimpact/dut/posix_server")
 
-	if err := dut.Create(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort); err != nil {
-		t.Fatalf("unable to create container %s: %s", dut.Name, err)
+	conf, hostconf, _ := dut.ConfigsFrom(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort)
+	hostconf.AutoRemove = true
+	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
+
+	if err := dut.CreateFrom(ctx, conf, hostconf, nil); err != nil {
+		t.Fatalf("unable to create container %s: %v", dut.Name, err)
 	}
-	defer dut.CleanUp()
+
+	defer dut.CleanUp(ctx)
 
 	// Add ctrlNet as eth1 and testNet as eth2.
 	const testNetDev = "eth2"
-	if err := addNetworks(dut, dutAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+	if err := addNetworks(ctx, dut, dutAddr, []*dockerutil.Network{ctrlNet, testNet}); err != nil {
 		t.Fatal(err)
 	}
 
-	if err := dut.Start(); err != nil {
+	if err := dut.Start(ctx); err != nil {
 		t.Fatalf("unable to start container %s: %s", dut.Name, err)
 	}
 
-	if _, err := dut.WaitForOutput("Server listening.*\n", 60*time.Second); err != nil {
+	if _, err := dut.WaitForOutput(ctx, "Server listening.*\n", 60*time.Second); err != nil {
 		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err)
 	}
 
-	dutTestDevice, dutDeviceInfo, err := deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+	dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut, addressInSubnet(dutAddr, *testNet.Subnet))
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -173,11 +193,11 @@ func TestOne(t *testing.T) {
 	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
 	// needed.
 	if remoteIPv6 == nil {
-		if _, err := dut.Exec(dockerutil.RunOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
+		if _, err := dut.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
 			t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err)
 		}
 		// Now try again, to make sure that it worked.
-		_, dutDeviceInfo, err = deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+		_, dutDeviceInfo, err = deviceByIP(ctx, dut, addressInSubnet(dutAddr, *testNet.Subnet))
 		if err != nil {
 			t.Fatal(err)
 		}
@@ -188,16 +208,20 @@ func TestOne(t *testing.T) {
 	}
 
 	// Create the Docker container for the testbench.
-	testbench := dockerutil.MakeDocker(logger("testbench"))
+	testbench := dockerutil.MakeContainer(ctx, logger("testbench"))
 	testbench.Runtime = "" // The testbench always runs on Linux.
 
 	tbb := path.Base(*testbenchBinary)
 	containerTestbenchBinary := "/packetimpact/" + tbb
 	runOpts = dockerutil.RunOpts{
-		Image:      "packetimpact",
-		CapAdd:     []string{"NET_ADMIN"},
-		Extra:      []string{"--sysctl", "net.ipv6.conf.all.disable_ipv6=0", "--rm", "-v", tmpDir + ":" + testOutputDir},
-		Foreground: true,
+		Image:  "packetimpact",
+		CapAdd: []string{"NET_ADMIN"},
+		Mounts: []mount.Mount{mount.Mount{
+			Type:     mount.TypeBind,
+			Source:   tmpDir,
+			Target:   testOutputDir,
+			ReadOnly: false,
+		}},
 	}
 	testbench.CopyFiles(&runOpts, "/packetimpact", "/test/packetimpact/tests/"+tbb)
 
@@ -227,30 +251,31 @@ func TestOne(t *testing.T) {
 		}
 	}()
 
-	if err := testbench.Create(runOpts, snifferArgs...); err != nil {
+	conf, hostconf, _ = testbench.ConfigsFrom(runOpts, snifferArgs...)
+	hostconf.AutoRemove = true
+	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
+
+	if err := testbench.CreateFrom(ctx, conf, hostconf, nil); err != nil {
 		t.Fatalf("unable to create container %s: %s", testbench.Name, err)
 	}
-	defer testbench.CleanUp()
+	defer testbench.CleanUp(ctx)
 
 	// Add ctrlNet as eth1 and testNet as eth2.
-	if err := addNetworks(testbench, testbenchAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+	if err := addNetworks(ctx, testbench, testbenchAddr, []*dockerutil.Network{ctrlNet, testNet}); err != nil {
 		t.Fatal(err)
 	}
 
-	if err := testbench.Start(); err != nil {
+	if err := testbench.Start(ctx); err != nil {
 		t.Fatalf("unable to start container %s: %s", testbench.Name, err)
 	}
 
 	// Kill so that it will flush output.
 	defer func() {
-		// Wait 1 second before killing tcpdump to give it time to flush
-		// any packets.  On linux tests killing it immediately can
-		// sometimes result in partial pcaps.
 		time.Sleep(1 * time.Second)
-		testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0])
+		testbench.Exec(ctx, dockerutil.ExecOpts{}, "killall", snifferArgs[0])
 	}()
 
-	if _, err := testbench.WaitForOutput(snifferRegex, 60*time.Second); err != nil {
+	if _, err := testbench.WaitForOutput(ctx, snifferRegex, 60*time.Second); err != nil {
 		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
 	}
 
@@ -258,8 +283,8 @@ func TestOne(t *testing.T) {
 	// will issue a RST. To prevent this IPtables can be used to filter out all
 	// incoming packets. The raw socket that packetimpact tests use will still see
 	// everything.
-	if _, err := testbench.Exec(dockerutil.RunOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil {
-		t.Fatalf("unable to Exec iptables on container %s: %s", testbench.Name, err)
+	if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil {
+		t.Fatalf("unable to Exec iptables on container %s: %s, logs from testbench:\n%s", testbench.Name, err, logs)
 	}
 
 	// FIXME(b/156449515): Some piece of the system has a race. The old
@@ -282,20 +307,20 @@ func TestOne(t *testing.T) {
 		"--device", testNetDev,
 		"--dut_type", *dutPlatform,
 	)
-	_, err = testbench.Exec(dockerutil.RunOpts{}, testArgs...)
+	logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...)
 	if !*expectFailure && err != nil {
-		t.Fatal("test failed:", err)
+		t.Fatalf("test failed: %v, logs from testbench:\n%s", err, logs)
 	}
 	if *expectFailure && err == nil {
-		t.Fatal("test failure expected but the test succeeded, enable the test and mark the corresponding bug as fixed")
+		t.Fatalf("test failure expected but the test succeeded, enable the test and mark the corresponding bug as fixed, logs from testbench:\n%s", logs)
 	}
 }
 
-func addNetworks(d *dockerutil.Docker, addr net.IP, networks []*dockerutil.DockerNetwork) error {
+func addNetworks(ctx context.Context, d *dockerutil.Container, addr net.IP, networks []*dockerutil.Network) error {
 	for _, dn := range networks {
 		ip := addressInSubnet(addr, *dn.Subnet)
 		// Connect to the network with the specified IP address.
-		if err := dn.Connect(d, "--ip", ip.String()); err != nil {
+		if err := dn.Connect(ctx, d, ip.String(), ""); err != nil {
 			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
 		}
 	}
@@ -313,9 +338,9 @@ func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
 	return net.IP(octets)
 }
 
-// makeDockerNetwork makes a randomly-named network that will start with the
+// createDockerNetwork makes a randomly-named network that will start with the
 // namePrefix. The network will be a random /24 subnet.
-func createDockerNetwork(n *dockerutil.DockerNetwork) error {
+func createDockerNetwork(ctx context.Context, n *dockerutil.Network) error {
 	randSource := rand.NewSource(time.Now().UnixNano())
 	r1 := rand.New(randSource)
 	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
@@ -324,12 +349,12 @@ func createDockerNetwork(n *dockerutil.DockerNetwork) error {
 		IP:   ip,
 		Mask: ip.DefaultMask(),
 	}
-	return n.Create()
+	return n.Create(ctx)
 }
 
 // deviceByIP finds a deviceInfo and device name from an IP address.
-func deviceByIP(d *dockerutil.Docker, ip net.IP) (string, netdevs.DeviceInfo, error) {
-	out, err := d.Exec(dockerutil.RunOpts{}, "ip", "addr", "show")
+func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) {
+	out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "show")
 	if err != nil {
 		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w", d.Name, err)
 	}
diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go
index d64f32a5b..6530036a8 100644
--- a/test/packetimpact/testbench/testbench.go
+++ b/test/packetimpact/testbench/testbench.go
@@ -31,23 +31,30 @@ var (
 	DUTType = ""
 	// Device is the local device on the test network.
 	Device = ""
+
 	// LocalIPv4 is the local IPv4 address on the test network.
 	LocalIPv4 = ""
+	// RemoteIPv4 is the DUT's IPv4 address on the test network.
+	RemoteIPv4 = ""
+	// IPv4PrefixLength is the network prefix length of the IPv4 test network.
+	IPv4PrefixLength = 0
+
 	// LocalIPv6 is the local IPv6 address on the test network.
 	LocalIPv6 = ""
+	// RemoteIPv6 is the DUT's IPv6 address on the test network.
+	RemoteIPv6 = ""
+
 	// LocalMAC is the local MAC address on the test network.
 	LocalMAC = ""
+	// RemoteMAC is the DUT's MAC address on the test network.
+	RemoteMAC = ""
+
 	// POSIXServerIP is the POSIX server's IP address on the control network.
 	POSIXServerIP = ""
 	// POSIXServerPort is the UDP port the POSIX server is bound to on the
 	// control network.
 	POSIXServerPort = 40000
-	// RemoteIPv4 is the DUT's IPv4 address on the test network.
-	RemoteIPv4 = ""
-	// RemoteIPv6 is the DUT's IPv6 address on the test network.
-	RemoteIPv6 = ""
-	// RemoteMAC is the DUT's MAC address on the test network.
-	RemoteMAC = ""
+
 	// RPCKeepalive is the gRPC keepalive.
 	RPCKeepalive = 10 * time.Second
 	// RPCTimeout is the gRPC timeout.
@@ -91,6 +98,12 @@ func genPseudoFlags() error {
 	LocalMAC = deviceInfo.MAC.String()
 	LocalIPv6 = deviceInfo.IPv6Addr.String()
 
+	if deviceInfo.IPv4Net != nil {
+		IPv4PrefixLength, _ = deviceInfo.IPv4Net.Mask.Size()
+	} else {
+		IPv4PrefixLength, _ = net.ParseIP(LocalIPv4).DefaultMask().Size()
+	}
+
 	return nil
 }
 
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 85749c559..3f538b5c6 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -18,8 +18,6 @@ packetimpact_go_test(
 packetimpact_go_test(
     name = "ipv4_id_uniqueness",
     srcs = ["ipv4_id_uniqueness_test.go"],
-    # TODO(b/157506701) Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/abi/linux",
         "//pkg/tcpip/header",
@@ -29,8 +27,8 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
-    name = "udp_recv_multicast",
-    srcs = ["udp_recv_multicast_test.go"],
+    name = "udp_recv_mcast_bcast",
+    srcs = ["udp_recv_mcast_bcast_test.go"],
     # TODO(b/152813495): Fix netstack then remove the line below.
     expect_netstack_failure = True,
     deps = [
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
index 5ab193181..8c89d57c9 100644
--- a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
+++ b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
@@ -88,8 +88,8 @@ func TestZeroWindowProbeRetransmit(t *testing.T) {
 			continue
 		}
 		// Check if the probes came at exponentially increasing intervals.
-		if p := time.Since(start); p < current-startProbeDuration {
-			t.Fatalf("zero probe came sooner interval %d probe %d\n", p, i)
+		if got, want := time.Since(start), current-startProbeDuration; got < want {
+			t.Errorf("got zero probe %d after %s, want >= %s", i, got, want)
 		}
 		// Acknowledge the zero-window probes from the dut.
 		conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
diff --git a/test/packetimpact/tests/udp_recv_multicast_test.go b/test/packetimpact/tests/udp_recv_mcast_bcast_test.go
index 77a9bfa1d..263a54291 100644
--- a/test/packetimpact/tests/udp_recv_multicast_test.go
+++ b/test/packetimpact/tests/udp_recv_mcast_bcast_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package udp_recv_multicast_test
+package udp_recv_mcast_bcast_test
 
 import (
 	"flag"
@@ -28,13 +28,41 @@ func init() {
 	testbench.RegisterFlags(flag.CommandLine)
 }
 
-func TestUDPRecvMulticast(t *testing.T) {
+func TestUDPRecvMulticastBroadcast(t *testing.T) {
 	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
-	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4(0, 0, 0, 0))
 	defer dut.Close(boundFD)
 	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 	defer conn.Close()
-	conn.SendIP(testbench.IPv4{DstAddr: testbench.Address(tcpip.Address(net.ParseIP("224.0.0.1").To4()))}, testbench.UDP{})
-	dut.Recv(boundFD, 100, 0)
+
+	for _, bcastAddr := range []net.IP{
+		broadcastAddr(net.ParseIP(testbench.RemoteIPv4), net.CIDRMask(testbench.IPv4PrefixLength, 32)),
+		net.IPv4(255, 255, 255, 255),
+		net.IPv4(224, 0, 0, 1),
+	} {
+		payload := testbench.GenerateRandomPayload(t, 1<<10)
+		conn.SendIP(
+			testbench.IPv4{DstAddr: testbench.Address(tcpip.Address(bcastAddr.To4()))},
+			testbench.UDP{},
+			&testbench.Payload{Bytes: payload},
+		)
+		t.Logf("Receiving packet sent to address: %s", bcastAddr)
+		if got, want := string(dut.Recv(boundFD, int32(len(payload)), 0)), string(payload); got != want {
+			t.Errorf("received payload does not match sent payload got: %s, want: %s", got, want)
+		}
+	}
+}
+
+// broadcastAddr returns the directed broadcast address of the IPv4 network
+// that contains ip, i.e. ip with every host bit of mask set. mask is expected
+// to be a 4-byte IPv4 mask. The result is nil if ip is not an IPv4 address.
+func broadcastAddr(ip net.IP, mask net.IPMask) net.IP {
+	// To4 may return a slice of ip's own storage, so work on a copy to avoid
+	// modifying the caller's address.
+	ip4 := append(net.IP(nil), ip.To4()...)
+	for i := range ip4 {
+		ip4[i] |= ^mask[i]
+	}
+	return ip4
 }
diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
index d0634b5c3..a26b83081 100644
--- a/test/root/cgroup_test.go
+++ b/test/root/cgroup_test.go
@@ -16,6 +16,7 @@ package root
 
 import (
 	"bufio"
+	"context"
 	"fmt"
 	"io/ioutil"
 	"os"
@@ -56,25 +57,24 @@ func verifyPid(pid int, path string) error {
 	return fmt.Errorf("got: %v, want: %d", gots, pid)
 }
 
-func TestMemCGroup(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+func TestMemCgroup(t *testing.T) {
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Start a new container and allocate the specified about of memory.
 	allocMemSize := 128 << 20
 	allocMemLimit := 2 * allocMemSize
-	if err := d.Spawn(dockerutil.RunOpts{
-		Image:  "basic/python",
-		Memory: allocMemLimit / 1024, // Must be in Kb.
-	}, "python", "-c", fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize)); err != nil {
+
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
+		Image:  "basic/ubuntu",
+		Memory: allocMemLimit, // Must be in bytes.
+	}, "python3", "-c", fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize)); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
 	// Extract the ID to lookup the cgroup.
-	gid, err := d.ID()
-	if err != nil {
-		t.Fatalf("Docker.ID() failed: %v", err)
-	}
+	gid := d.ID()
 	t.Logf("cgroup ID: %s", gid)
 
 	// Wait when the container will allocate memory.
@@ -127,8 +127,9 @@ func TestMemCGroup(t *testing.T) {
 
 // TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestCgroup(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// This is not a comprehensive list of attributes.
 	//
@@ -137,94 +138,133 @@ func TestCgroup(t *testing.T) {
 	// are often run on a single core virtual machine, and there is only a single
 	// CPU available in our current set, and every container's set.
 	attrs := []struct {
-		arg            string
+		field          string
+		value          int64
 		ctrl           string
 		file           string
 		want           string
 		skipIfNotFound bool
 	}{
 		{
-			arg:  "--cpu-shares=1000",
-			ctrl: "cpu",
-			file: "cpu.shares",
-			want: "1000",
+			field: "cpu-shares",
+			value: 1000,
+			ctrl:  "cpu",
+			file:  "cpu.shares",
+			want:  "1000",
 		},
 		{
-			arg:  "--cpu-period=2000",
-			ctrl: "cpu",
-			file: "cpu.cfs_period_us",
-			want: "2000",
+			field: "cpu-period",
+			value: 2000,
+			ctrl:  "cpu",
+			file:  "cpu.cfs_period_us",
+			want:  "2000",
 		},
 		{
-			arg:  "--cpu-quota=3000",
-			ctrl: "cpu",
-			file: "cpu.cfs_quota_us",
-			want: "3000",
+			field: "cpu-quota",
+			value: 3000,
+			ctrl:  "cpu",
+			file:  "cpu.cfs_quota_us",
+			want:  "3000",
 		},
 		{
-			arg:  "--kernel-memory=100MB",
-			ctrl: "memory",
-			file: "memory.kmem.limit_in_bytes",
-			want: "104857600",
+			field: "kernel-memory",
+			value: 100 << 20,
+			ctrl:  "memory",
+			file:  "memory.kmem.limit_in_bytes",
+			want:  "104857600",
 		},
 		{
-			arg:  "--memory=1GB",
-			ctrl: "memory",
-			file: "memory.limit_in_bytes",
-			want: "1073741824",
+			field: "memory",
+			value: 1 << 30,
+			ctrl:  "memory",
+			file:  "memory.limit_in_bytes",
+			want:  "1073741824",
 		},
 		{
-			arg:  "--memory-reservation=500MB",
-			ctrl: "memory",
-			file: "memory.soft_limit_in_bytes",
-			want: "524288000",
+			field: "memory-reservation",
+			value: 500 << 20,
+			ctrl:  "memory",
+			file:  "memory.soft_limit_in_bytes",
+			want:  "524288000",
 		},
 		{
-			arg:            "--memory-swap=2GB",
+			field:          "memory-swap",
+			value:          2 << 30,
 			ctrl:           "memory",
 			file:           "memory.memsw.limit_in_bytes",
 			want:           "2147483648",
 			skipIfNotFound: true, // swap may be disabled on the machine.
 		},
 		{
-			arg:  "--memory-swappiness=5",
-			ctrl: "memory",
-			file: "memory.swappiness",
-			want: "5",
+			field: "memory-swappiness",
+			value: 5,
+			ctrl:  "memory",
+			file:  "memory.swappiness",
+			want:  "5",
 		},
 		{
-			arg:            "--blkio-weight=750",
+			field:          "blkio-weight",
+			value:          750,
 			ctrl:           "blkio",
 			file:           "blkio.weight",
 			want:           "750",
 			skipIfNotFound: true, // blkio groups may not be available.
 		},
 		{
-			arg:  "--pids-limit=1000",
-			ctrl: "pids",
-			file: "pids.max",
-			want: "1000",
+			field: "pids-limit",
+			value: 1000,
+			ctrl:  "pids",
+			file:  "pids.max",
+			want:  "1000",
 		},
 	}
 
-	args := make([]string, 0, len(attrs))
+	// Make configs.
+	conf, hostconf, _ := d.ConfigsFrom(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "10000")
+
+	// Add Cgroup arguments to configs.
 	for _, attr := range attrs {
-		args = append(args, attr.arg)
+		switch attr.field {
+		case "cpu-shares":
+			hostconf.Resources.CPUShares = attr.value
+		case "cpu-period":
+			hostconf.Resources.CPUPeriod = attr.value
+		case "cpu-quota":
+			hostconf.Resources.CPUQuota = attr.value
+		case "kernel-memory":
+			hostconf.Resources.KernelMemory = attr.value
+		case "memory":
+			hostconf.Resources.Memory = attr.value
+		case "memory-reservation":
+			hostconf.Resources.MemoryReservation = attr.value
+		case "memory-swap":
+			hostconf.Resources.MemorySwap = attr.value
+		case "memory-swappiness":
+			val := attr.value
+			hostconf.Resources.MemorySwappiness = &val
+		case "blkio-weight":
+			hostconf.Resources.BlkioWeight = uint16(attr.value)
+		case "pids-limit":
+			val := attr.value
+			hostconf.Resources.PidsLimit = &val
+
+		}
 	}
 
-	// Start the container.
-	if err := d.Spawn(dockerutil.RunOpts{
-		Image: "basic/alpine",
-		Extra: args, // Cgroup arguments.
-	}, "sleep", "10000"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
+	// Create container.
+	if err := d.CreateFrom(ctx, conf, hostconf, nil); err != nil {
+		t.Fatalf("create failed with: %v", err)
 	}
 
-	// Lookup the relevant cgroup ID.
-	gid, err := d.ID()
-	if err != nil {
-		t.Fatalf("Docker.ID() failed: %v", err)
+	// Start container.
+	if err := d.Start(ctx); err != nil {
+		t.Fatalf("start failed with: %v", err)
 	}
+
+	// Lookup the relevant cgroup ID.
+	gid := d.ID()
 	t.Logf("cgroup ID: %s", gid)
 
 	// Check list of attributes defined above.
@@ -239,7 +279,7 @@ func TestCgroup(t *testing.T) {
 			t.Fatalf("failed to read %q: %v", path, err)
 		}
 		if got := strings.TrimSpace(string(out)); got != attr.want {
-			t.Errorf("arg: %q, cgroup attribute %s/%s, got: %q, want: %q", attr.arg, attr.ctrl, attr.file, got, attr.want)
+			t.Errorf("field: %q, cgroup attribute %s/%s, got: %q, want: %q", attr.field, attr.ctrl, attr.file, got, attr.want)
 		}
 	}
 
@@ -257,7 +297,7 @@ func TestCgroup(t *testing.T) {
 		"pids",
 		"systemd",
 	}
-	pid, err := d.SandboxPid()
+	pid, err := d.SandboxPid(ctx)
 	if err != nil {
 		t.Fatalf("SandboxPid: %v", err)
 	}
@@ -269,29 +309,34 @@ func TestCgroup(t *testing.T) {
 	}
 }
 
-// TestCgroup sets cgroup options and checks that cgroup was properly configured.
+// TestCgroupParent sets the "CgroupParent" option and checks that the child and parent's
+// cgroups are created correctly relative to each other.
 func TestCgroupParent(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
 	// Construct a known cgroup name.
 	parent := testutil.RandomID("runsc-")
-	if err := d.Spawn(dockerutil.RunOpts{
+	conf, hostconf, _ := d.ConfigsFrom(dockerutil.RunOpts{
 		Image: "basic/alpine",
-		Extra: []string{fmt.Sprintf("--cgroup-parent=%s", parent)},
-	}, "sleep", "10000"); err != nil {
-		t.Fatalf("docker run failed: %v", err)
+	}, "sleep", "10000")
+	hostconf.Resources.CgroupParent = parent
+
+	if err := d.CreateFrom(ctx, conf, hostconf, nil); err != nil {
+		t.Fatalf("create failed with: %v", err)
 	}
 
-	// Extract the ID to look up the cgroup.
-	gid, err := d.ID()
-	if err != nil {
-		t.Fatalf("Docker.ID() failed: %v", err)
+	if err := d.Start(ctx); err != nil {
+		t.Fatalf("start failed with: %v", err)
 	}
+
+	// Extract the ID to look up the cgroup.
+	gid := d.ID()
 	t.Logf("cgroup ID: %s", gid)
 
 	// Check that sandbox is inside cgroup.
-	pid, err := d.SandboxPid()
+	pid, err := d.SandboxPid(ctx)
 	if err != nil {
 		t.Fatalf("SandboxPid: %v", err)
 	}
diff --git a/test/root/chroot_test.go b/test/root/chroot_test.go
index a306132a4..58fcd6f08 100644
--- a/test/root/chroot_test.go
+++ b/test/root/chroot_test.go
@@ -16,6 +16,7 @@
 package root
 
 import (
+	"context"
 	"fmt"
 	"io/ioutil"
 	"os/exec"
@@ -30,16 +31,17 @@ import (
 // TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned
 // up after the sandbox is destroyed.
 func TestChroot(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "sleep", "10000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	pid, err := d.SandboxPid()
+	pid, err := d.SandboxPid(ctx)
 	if err != nil {
 		t.Fatalf("Docker.SandboxPid(): %v", err)
 	}
@@ -75,14 +77,15 @@ func TestChroot(t *testing.T) {
 		t.Errorf("chroot got children %v, want %v", fi[0].Name(), "proc")
 	}
 
-	d.CleanUp()
+	d.CleanUp(ctx)
 }
 
 func TestChrootGofer(t *testing.T) {
-	d := dockerutil.MakeDocker(t)
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, t)
+	defer d.CleanUp(ctx)
 
-	if err := d.Spawn(dockerutil.RunOpts{
+	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/alpine",
 	}, "sleep", "10000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
@@ -91,7 +94,7 @@ func TestChrootGofer(t *testing.T) {
 	// It's tricky to find gofers. Get sandbox PID first, then find parent. From
 	// parent get all immediate children, remove the sandbox, and everything else
 	// are gofers.
-	sandPID, err := d.SandboxPid()
+	sandPID, err := d.SandboxPid(ctx)
 	if err != nil {
 		t.Fatalf("Docker.SandboxPid(): %v", err)
 	}
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 022de5ff7..f98d02e00 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -6,28 +6,33 @@ runtime_test(
     name = "go1.12",
     exclude_file = "exclude_go1.12.csv",
     lang = "go",
+    shard_count = 5,
 )
 
 runtime_test(
     name = "java11",
     exclude_file = "exclude_java11.csv",
     lang = "java",
+    shard_count = 10,
 )
 
 runtime_test(
     name = "nodejs12.4.0",
     exclude_file = "exclude_nodejs12.4.0.csv",
     lang = "nodejs",
+    shard_count = 5,
 )
 
 runtime_test(
     name = "php7.3.6",
     exclude_file = "exclude_php7.3.6.csv",
     lang = "php",
+    shard_count = 5,
 )
 
 runtime_test(
     name = "python3.7.3",
     exclude_file = "exclude_python3.7.3.csv",
     lang = "python",
+    shard_count = 5,
 )
diff --git a/test/runtimes/exclude_nodejs12.4.0.csv b/test/runtimes/exclude_nodejs12.4.0.csv
index 4ab4e2927..e7edfa0a5 100644
--- a/test/runtimes/exclude_nodejs12.4.0.csv
+++ b/test/runtimes/exclude_nodejs12.4.0.csv
@@ -9,6 +9,8 @@ fixtures/test-fs-stat-sync-overflow.js,,
 internet/test-dgram-broadcast-multi-process.js,,
 internet/test-dgram-multicast-multi-process.js,,
 internet/test-dgram-multicast-set-interface-lo.js,,
+internet/test-doctool-versions.js,,
+internet/test-uv-threadpool-schedule.js,,
 parallel/test-cluster-dgram-reuse.js,b/64024294,
 parallel/test-dgram-bind-fd.js,b/132447356,
 parallel/test-dgram-create-socket-handle-fd.js,b/132447238,
@@ -45,3 +47,4 @@ pseudo-tty/test-tty-window-size.js,,
 pseudo-tty/test-tty-wrap.js,,
 pummel/test-net-pingpong.js,,
 pummel/test-vm-memleak.js,,
+tick-processor/test-tick-processor-builtin.js,,
diff --git a/test/runtimes/exclude_php7.3.6.csv b/test/runtimes/exclude_php7.3.6.csv
index 456bf7487..f3606bfe8 100644
--- a/test/runtimes/exclude_php7.3.6.csv
+++ b/test/runtimes/exclude_php7.3.6.csv
@@ -8,6 +8,9 @@ ext/mbstring/tests/bug77165.phpt,,
 ext/mbstring/tests/bug77454.phpt,,
 ext/mbstring/tests/mb_convert_encoding_leak.phpt,,
 ext/mbstring/tests/mb_strrpos_encoding_3rd_param.phpt,,
+ext/session/tests/session_set_save_handler_class_018.phpt,,
+ext/session/tests/session_set_save_handler_iface_003.phpt,,
+ext/session/tests/session_set_save_handler_variation4.phpt,,
 ext/standard/tests/file/filetype_variation.phpt,,
 ext/standard/tests/file/fopen_variation19.phpt,,
 ext/standard/tests/file/php_fd_wrapper_01.phpt,,
@@ -21,9 +24,12 @@ ext/standard/tests/file/symlink_link_linkinfo_is_link_variation8.phpt,,
 ext/standard/tests/general_functions/escapeshellarg_bug71270.phpt,,
 ext/standard/tests/general_functions/escapeshellcmd_bug71270.phpt,,
 ext/standard/tests/network/bug20134.phpt,,
+ext/standard/tests/streams/stream_socket_sendto.phpt,,
+ext/standard/tests/strings/007.phpt,,
 tests/output/stream_isatty_err.phpt,b/68720279,
 tests/output/stream_isatty_in-err.phpt,b/68720282,
 tests/output/stream_isatty_in-out-err.phpt,,
 tests/output/stream_isatty_in-out.phpt,b/68720299,
 tests/output/stream_isatty_out-err.phpt,b/68720311,
 tests/output/stream_isatty_out.phpt,b/68720325,
+Zend/tests/concat_003.phpt,,
diff --git a/test/runtimes/proctor/go.go b/test/runtimes/proctor/go.go
index 3e2d5d8db..073c2959d 100644
--- a/test/runtimes/proctor/go.go
+++ b/test/runtimes/proctor/go.go
@@ -74,17 +74,28 @@ func (goRunner) ListTests() ([]string, error) {
 	return append(toolSlice, diskFiltered...), nil
 }
 
-// TestCmd implements TestRunner.TestCmd.
-func (goRunner) TestCmd(test string) *exec.Cmd {
-	// Check if test exists on disk by searching for file of the same name.
-	// This will determine whether or not it is a Go test on disk.
-	if strings.HasSuffix(test, ".go") {
-		// Test has suffix ".go" which indicates a disk test, run it as such.
-		cmd := exec.Command("go", "run", "run.go", "-v", "--", test)
+// TestCmds implements TestRunner.TestCmds.
+func (goRunner) TestCmds(tests []string) []*exec.Cmd {
+	var toolTests, onDiskTests []string
+	for _, test := range tests {
+		if strings.HasSuffix(test, ".go") {
+			onDiskTests = append(onDiskTests, test)
+		} else {
+			toolTests = append(toolTests, test)
+		}
+	}
+
+	var cmds []*exec.Cmd
+	if len(toolTests) > 0 {
+		// -run takes a Go regexp and exec.Command bypasses the shell, so names
+		// are alternated with a bare "|"; `\|` would match a literal pipe.
+		cmds = append(cmds, exec.Command("go", "tool", "dist", "test", "-run", strings.Join(toolTests, "|")))
+	}
+	if len(onDiskTests) > 0 {
+		cmd := exec.Command("go", append([]string{"run", "run.go", "-v", "--"}, onDiskTests...)...)
 		cmd.Dir = goTestDir
-		return cmd
+		cmds = append(cmds, cmd)
 	}
 
-	// No ".go" suffix, run as a tool test.
-	return exec.Command("go", "tool", "dist", "test", "-run", test)
+	return cmds
 }
diff --git a/test/runtimes/proctor/java.go b/test/runtimes/proctor/java.go
index 8b362029d..737fbe23e 100644
--- a/test/runtimes/proctor/java.go
+++ b/test/runtimes/proctor/java.go
@@ -60,12 +60,14 @@ func (javaRunner) ListTests() ([]string, error) {
 	return testSlice, nil
 }
 
-// TestCmd implements TestRunner.TestCmd.
-func (javaRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{
-		"-noreport",
-		"-dir:" + javaTestDir,
-		test,
-	}
-	return exec.Command("jtreg", args...)
+// TestCmds implements TestRunner.TestCmds.
+func (javaRunner) TestCmds(tests []string) []*exec.Cmd {
+	args := append(
+		[]string{
+			"-noreport",
+			"-dir:" + javaTestDir,
+		},
+		tests...,
+	)
+	return []*exec.Cmd{exec.Command("jtreg", args...)}
 }
diff --git a/test/runtimes/proctor/nodejs.go b/test/runtimes/proctor/nodejs.go
index bd57db444..23d6edc72 100644
--- a/test/runtimes/proctor/nodejs.go
+++ b/test/runtimes/proctor/nodejs.go
@@ -39,8 +39,8 @@ func (nodejsRunner) ListTests() ([]string, error) {
 	return testSlice, nil
 }
 
-// TestCmd implements TestRunner.TestCmd.
-func (nodejsRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{filepath.Join("tools", "test.py"), test}
-	return exec.Command("/usr/bin/python", args...)
+// TestCmds implements TestRunner.TestCmds.
+func (nodejsRunner) TestCmds(tests []string) []*exec.Cmd {
+	args := append([]string{filepath.Join("tools", "test.py")}, tests...)
+	return []*exec.Cmd{exec.Command("/usr/bin/python", args...)}
 }
diff --git a/test/runtimes/proctor/php.go b/test/runtimes/proctor/php.go
index 9115040e1..6a83d64e3 100644
--- a/test/runtimes/proctor/php.go
+++ b/test/runtimes/proctor/php.go
@@ -17,6 +17,7 @@ package main
 import (
 	"os/exec"
 	"regexp"
+	"strings"
 )
 
 var phpTestRegEx = regexp.MustCompile(`^.+\.phpt$`)
@@ -35,8 +36,8 @@ func (phpRunner) ListTests() ([]string, error) {
 	return testSlice, nil
 }
 
-// TestCmd implements TestRunner.TestCmd.
-func (phpRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{"test", "TESTS=" + test}
-	return exec.Command("make", args...)
+// TestCmds implements TestRunner.TestCmds.
+func (phpRunner) TestCmds(tests []string) []*exec.Cmd {
+	args := []string{"test", "TESTS=" + strings.Join(tests, " ")}
+	return []*exec.Cmd{exec.Command("make", args...)}
 }
diff --git a/test/runtimes/proctor/proctor.go b/test/runtimes/proctor/proctor.go
index b54abe434..9e0642424 100644
--- a/test/runtimes/proctor/proctor.go
+++ b/test/runtimes/proctor/proctor.go
@@ -25,6 +25,7 @@ import (
 	"os/signal"
 	"path/filepath"
 	"regexp"
+	"strings"
 	"syscall"
 )
 
@@ -34,15 +35,18 @@ type TestRunner interface {
 	// ListTests returns a string slice of tests available to run.
 	ListTests() ([]string, error)
 
-	// TestCmd returns an *exec.Cmd that will run the given test.
-	TestCmd(test string) *exec.Cmd
+	// TestCmds returns a slice of *exec.Cmd that will run the given tests.
+	// There is no correlation between the number of exec.Cmds returned and the
+	// number of tests. It could return one command to run all tests or a few
+	// commands that collectively run all.
+	TestCmds(tests []string) []*exec.Cmd
 }
 
 var (
-	runtime  = flag.String("runtime", "", "name of runtime")
-	list     = flag.Bool("list", false, "list all available tests")
-	testName = flag.String("test", "", "run a single test from the list of available tests")
-	pause    = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
+	runtime   = flag.String("runtime", "", "name of runtime")
+	list      = flag.Bool("list", false, "list all available tests")
+	testNames = flag.String("tests", "", "run a subset of the available tests")
+	pause     = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
 )
 
 func main() {
@@ -75,18 +79,20 @@ func main() {
 	}
 
 	var tests []string
-	if *testName == "" {
+	if *testNames == "" {
 		// Run every test.
 		tests, err = tr.ListTests()
 		if err != nil {
 			log.Fatalf("failed to get all tests: %v", err)
 		}
 	} else {
-		// Run a single test.
-		tests = []string{*testName}
+		// Run a subset of the tests.
+		tests = strings.Split(*testNames, ",")
 	}
-	for _, test := range tests {
-		cmd := tr.TestCmd(test)
+
+	// Run tests.
+	cmds := tr.TestCmds(tests)
+	for _, cmd := range cmds {
 		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
 		if err := cmd.Run(); err != nil {
 			log.Fatalf("FAIL: %v", err)
diff --git a/test/runtimes/proctor/python.go b/test/runtimes/proctor/python.go
index b9e0fbe6f..7c598801b 100644
--- a/test/runtimes/proctor/python.go
+++ b/test/runtimes/proctor/python.go
@@ -42,8 +42,8 @@ func (pythonRunner) ListTests() ([]string, error) {
 	return toolSlice, nil
 }
 
-// TestCmd implements TestRunner.TestCmd.
-func (pythonRunner) TestCmd(test string) *exec.Cmd {
-	args := []string{"-m", "test", test}
-	return exec.Command("./python", args...)
+// TestCmds implements TestRunner.TestCmds.
+func (pythonRunner) TestCmds(tests []string) []*exec.Cmd {
+	args := append([]string{"-m", "test"}, tests...)
+	return []*exec.Cmd{exec.Command("./python", args...)}
 }
diff --git a/test/runtimes/runner/BUILD b/test/runtimes/runner/BUILD
index 3972244b9..dc0d5d5b4 100644
--- a/test/runtimes/runner/BUILD
+++ b/test/runtimes/runner/BUILD
@@ -8,6 +8,7 @@ go_binary(
     srcs = ["main.go"],
     visibility = ["//test/runtimes:__pkg__"],
     deps = [
+        "//pkg/log",
         "//pkg/test/dockerutil",
         "//pkg/test/testutil",
     ],
diff --git a/test/runtimes/runner/exclude_test.go b/test/runtimes/runner/exclude_test.go
index c08755894..67c2170c8 100644
--- a/test/runtimes/runner/exclude_test.go
+++ b/test/runtimes/runner/exclude_test.go
@@ -26,7 +26,7 @@ func TestMain(m *testing.M) {
 }
 
 // Test that the exclude file parses without error.
-func TestBlacklists(t *testing.T) {
+func TestExcludelist(t *testing.T) {
 	ex, err := getExcludes()
 	if err != nil {
 		t.Fatalf("error parsing exclude file: %v", err)
diff --git a/test/runtimes/runner/main.go b/test/runtimes/runner/main.go
index 54d1169ef..e230912c9 100644
--- a/test/runtimes/runner/main.go
+++ b/test/runtimes/runner/main.go
@@ -16,6 +16,7 @@
 package main
 
 import (
+	"context"
 	"encoding/csv"
 	"flag"
 	"fmt"
@@ -26,6 +27,7 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 )
@@ -34,10 +36,11 @@ var (
 	lang        = flag.String("lang", "", "language runtime to test")
 	image       = flag.String("image", "", "docker image with runtime tests")
 	excludeFile = flag.String("exclude_file", "", "file containing list of tests to exclude, in CSV format with fields: test name, bug id, comment")
+	batchSize   = flag.Int("batch", 50, "number of test cases run in one command")
 )
 
 // Wait time for each test to run.
-const timeout = 5 * time.Minute
+const timeout = 45 * time.Minute
 
 func main() {
 	flag.Parse()
@@ -60,13 +63,19 @@ func runTests() int {
 	}
 
 	// Construct the shared docker instance.
-	d := dockerutil.MakeDocker(testutil.DefaultLogger(*lang))
-	defer d.CleanUp()
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, testutil.DefaultLogger(*lang))
+	defer d.CleanUp(ctx)
+
+	if err := testutil.TouchShardStatusFile(); err != nil {
+		fmt.Fprintf(os.Stderr, "error touching status shard file: %v\n", err)
+		return 1
+	}
 
 	// Get a slice of tests to run. This will also start a single Docker
 	// container that will be used to run each test. The final test will
 	// stop the Docker container.
-	tests, err := getTests(d, excludes)
+	tests, err := getTests(ctx, d, excludes)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "%s\n", err.Error())
 		return 1
@@ -77,18 +86,18 @@ func runTests() int {
 }
 
 // getTests executes all tests as table tests.
-func getTests(d *dockerutil.Docker, excludes map[string]struct{}) ([]testing.InternalTest, error) {
+func getTests(ctx context.Context, d *dockerutil.Container, excludes map[string]struct{}) ([]testing.InternalTest, error) {
 	// Start the container.
 	opts := dockerutil.RunOpts{
 		Image: fmt.Sprintf("runtimes/%s", *image),
 	}
 	d.CopyFiles(&opts, "/proctor", "test/runtimes/proctor/proctor")
-	if err := d.Spawn(opts, "/proctor/proctor", "--pause"); err != nil {
+	if err := d.Spawn(ctx, opts, "/proctor/proctor", "--pause"); err != nil {
 		return nil, fmt.Errorf("docker run failed: %v", err)
 	}
 
 	// Get a list of all tests in the image.
-	list, err := d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--list")
+	list, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", *lang, "--list")
 	if err != nil {
 		return nil, fmt.Errorf("docker exec failed: %v", err)
 	}
@@ -103,17 +112,28 @@ func getTests(d *dockerutil.Docker, excludes map[string]struct{}) ([]testing.Int
 	}
 
 	var itests []testing.InternalTest
-	for _, tci := range indices {
-		// Capture tc in this scope.
-		tc := tests[tci]
+	for i := 0; i < len(indices); i += *batchSize {
+		var tcs []string
+		end := i + *batchSize
+		if end > len(indices) {
+			end = len(indices)
+		}
+		for _, tc := range indices[i:end] {
+			// Add test if not excluded.
+			if _, ok := excludes[tests[tc]]; ok {
+				log.Infof("Skipping test case %s\n", tests[tc])
+				continue
+			}
+			tcs = append(tcs, tests[tc])
+		}
+		if len(tcs) == 0 {
+			// Every test in this batch is excluded. Passing an empty --tests
+			// value would make proctor run the whole suite, so skip the batch.
+			continue
+		}
 		itests = append(itests, testing.InternalTest{
-			Name: tc,
+			Name: strings.Join(tcs, ", "),
 			F: func(t *testing.T) {
-				// Is the test excluded?
-				if _, ok := excludes[tc]; ok {
-					t.Skipf("SKIP: excluded test %q", tc)
-				}
-
 				var (
 					now    = time.Now()
 					done   = make(chan struct{})
@@ -122,20 +142,20 @@ func getTests(d *dockerutil.Docker, excludes map[string]struct{}) ([]testing.Int
 				)
 
 				go func() {
-					fmt.Printf("RUNNING %s...\n", tc)
-					output, err = d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--test", tc)
+					fmt.Printf("RUNNING the following in a batch\n%s\n", strings.Join(tcs, "\n"))
+					output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", *lang, "--tests", strings.Join(tcs, ","))
 					close(done)
 				}()
 
 				select {
 				case <-done:
 					if err == nil {
-						fmt.Printf("PASS: %s (%v)\n\n", tc, time.Since(now))
+						fmt.Printf("PASS: (%v)\n\n", time.Since(now))
 						return
 					}
-					t.Errorf("FAIL: %s (%v):\n%s\n", tc, time.Since(now), output)
+					t.Errorf("FAIL: (%v):\n%s\n", time.Since(now), output)
 				case <-time.After(timeout):
-					t.Errorf("TIMEOUT: %s (%v):\n%s\n", tc, time.Since(now), output)
+					t.Errorf("TIMEOUT: (%v):\n%s\n", time.Since(now), output)
 				}
 			},
 		})
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 7c4cd8192..28ef55945 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -283,6 +283,7 @@ syscall_test(
     size = "medium",
     add_overlay = False,  # TODO(gvisor.dev/issue/317): enable when fixed.
     test = "//test/syscalls/linux:inotify_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -351,6 +352,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:mknod_test",
+    vfs2 = "True",
 )
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 9e097c888..662d780d8 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1330,6 +1330,7 @@ cc_binary(
     name = "packet_socket_raw_test",
     testonly = 1,
     srcs = ["packet_socket_raw.cc"],
+    defines = select_system(),
     linkstatic = 1,
     deps = [
         ":socket_test_util",
@@ -1809,6 +1810,7 @@ cc_binary(
     name = "raw_socket_test",
     testonly = 1,
     srcs = ["raw_socket.cc"],
+    defines = select_system(),
     linkstatic = 1,
     deps = [
         ":socket_test_util",
@@ -3407,6 +3409,7 @@ cc_binary(
     name = "tcp_socket_test",
     testonly = 1,
     srcs = ["tcp_socket.cc"],
+    defines = select_system(),
     linkstatic = 1,
     deps = [
         ":socket_test_util",
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index e09afafe9..c5acfc794 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -553,7 +553,12 @@ TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
   // Hold onto TempPath objects so they are not destructed prematurely.
   std::vector<TempPath> interpreter_symlinks;
   std::vector<TempPath> script_symlinks;
-  for (int i = 0; i < kLinuxMaxSymlinks; i++) {
+  // Replace both the interpreter and script paths with symlink chains of just
+  // over half the symlink limit each; this is the minimum required to test that
+  // the symlink limit applies separately to each traversal, while tolerating
+  // some symlinks in the resolution of (the original) interpreter_path and
+  // script_path.
+  for (int i = 0; i < (kLinuxMaxSymlinks / 2) + 1; i++) {
     interpreter_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
         TempPath::CreateSymlinkTo(tmp_dir, interpreter_path)));
     interpreter_path = interpreter_symlinks[i].path();
@@ -679,18 +684,16 @@ TEST(ExecveatTest, UnshareFiles) {
   const FileDescriptor fd_closed_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
 
-  pid_t child;
-  EXPECT_THAT(child = syscall(__NR_clone, SIGCHLD | CLONE_VFORK | CLONE_FILES,
-                              0, 0, 0, 0),
-              SyscallSucceeds());
+  ExecveArray argv = {"test"};
+  ExecveArray envp;
+  std::string child_path = RunfilePath(kBasicWorkload);
+  pid_t child =
+      syscall(__NR_clone, SIGCHLD | CLONE_VFORK | CLONE_FILES, 0, 0, 0, 0);
   if (child == 0) {
-    ExecveArray argv = {"test"};
-    ExecveArray envp;
-    ASSERT_THAT(
-        execve(RunfilePath(kBasicWorkload).c_str(), argv.get(), envp.get()),
-        SyscallSucceeds());
+    execve(child_path.c_str(), argv.get(), envp.get());
     _exit(1);
   }
+  ASSERT_THAT(child, SyscallSucceeds());
 
   int status;
   ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc
index 4c45766c7..05dfb375a 100644
--- a/test/syscalls/linux/mknod.cc
+++ b/test/syscalls/linux/mknod.cc
@@ -15,6 +15,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #include <sys/un.h>
 #include <unistd.h>
 
@@ -39,7 +40,28 @@ TEST(MknodTest, RegularFile) {
   EXPECT_THAT(mknod(node1.c_str(), 0, 0), SyscallSucceeds());
 }
 
-TEST(MknodTest, MknodAtRegularFile) {
+TEST(MknodTest, RegularFilePermissions) {
+  const std::string node = NewTempAbsPath();
+  mode_t newUmask = 0077;
+  umask(newUmask);
+
+  // Attempt to open file with mode 0777. Not specifying file type should create
+  // a regular file.
+  mode_t perms = S_IRWXU | S_IRWXG | S_IRWXO;
+  EXPECT_THAT(mknod(node.c_str(), perms, 0), SyscallSucceeds());
+
+  // In the absence of a default ACL, the permissions of the created node are
+  // (mode & ~umask).  -- mknod(2)
+  mode_t wantPerms = perms & ~newUmask;
+  struct stat st;
+  ASSERT_THAT(stat(node.c_str(), &st), SyscallSucceeds());
+  ASSERT_EQ(st.st_mode & 0777, wantPerms);
+
+  // "Zero file type is equivalent to type S_IFREG." - mknod(2)
+  ASSERT_EQ(st.st_mode & S_IFMT, S_IFREG);
+}
+
+TEST(MknodTest, MknodAtFIFO) {
   const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const std::string fifo_relpath = NewTempRelPath();
   const std::string fifo = JoinPath(dir.path(), fifo_relpath);
@@ -72,7 +94,7 @@ TEST(MknodTest, MknodOnExistingPathFails) {
 TEST(MknodTest, UnimplementedTypesReturnError) {
   const std::string path = NewTempAbsPath();
 
-  if (IsRunningOnGvisor()) {
+  if (IsRunningWithVFS1()) {
     ASSERT_THAT(mknod(path.c_str(), S_IFSOCK, 0),
                 SyscallFailsWithErrno(EOPNOTSUPP));
   }
diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc
index d258d353c..6a963b12c 100644
--- a/test/syscalls/linux/packet_socket_raw.cc
+++ b/test/syscalls/linux/packet_socket_raw.cc
@@ -14,6 +14,9 @@
 
 #include <arpa/inet.h>
 #include <linux/capability.h>
+#ifndef __fuchsia__
+#include <linux/filter.h>
+#endif  // __fuchsia__
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
 #include <net/ethernet.h>
@@ -97,7 +100,7 @@ class RawPacketTest : public ::testing::TestWithParam<int> {
   int GetLoopbackIndex();
 
   // The socket used for both reading and writing.
-  int socket_;
+  int s_;
 };
 
 void RawPacketTest::SetUp() {
@@ -108,34 +111,58 @@ void RawPacketTest::SetUp() {
   }
 
   if (!IsRunningOnGvisor()) {
+    // Ensure that looped back packets aren't rejected by the kernel.
     FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE(
-        Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY));
+        Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDWR));
     FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE(
-        Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDONLY));
+        Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDWR));
     char enabled;
     ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
                 SyscallSucceedsWithValue(1));
-    ASSERT_EQ(enabled, '1');
+    if (enabled != '1') {
+      enabled = '1';
+      ASSERT_THAT(lseek(acceptLocal.get(), 0, SEEK_SET),
+                  SyscallSucceedsWithValue(0));
+      ASSERT_THAT(write(acceptLocal.get(), &enabled, 1),
+                  SyscallSucceedsWithValue(1));
+      ASSERT_THAT(lseek(acceptLocal.get(), 0, SEEK_SET),
+                  SyscallSucceedsWithValue(0));
+      ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
+                  SyscallSucceedsWithValue(1));
+      ASSERT_EQ(enabled, '1');
+    }
+
     ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
                 SyscallSucceedsWithValue(1));
-    ASSERT_EQ(enabled, '1');
+    if (enabled != '1') {
+      enabled = '1';
+      ASSERT_THAT(lseek(routeLocalnet.get(), 0, SEEK_SET),
+                  SyscallSucceedsWithValue(0));
+      ASSERT_THAT(write(routeLocalnet.get(), &enabled, 1),
+                  SyscallSucceedsWithValue(1));
+      ASSERT_THAT(lseek(routeLocalnet.get(), 0, SEEK_SET),
+                  SyscallSucceedsWithValue(0));
+      ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
+                  SyscallSucceedsWithValue(1));
+      ASSERT_EQ(enabled, '1');
+    }
   }
 
-  ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_RAW, htons(GetParam())),
+  ASSERT_THAT(s_ = socket(AF_PACKET, SOCK_RAW, htons(GetParam())),
               SyscallSucceeds());
 }
 
 void RawPacketTest::TearDown() {
   // TearDown will be run even if we skip the test.
   if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
-    EXPECT_THAT(close(socket_), SyscallSucceeds());
+    EXPECT_THAT(close(s_), SyscallSucceeds());
   }
 }
 
 int RawPacketTest::GetLoopbackIndex() {
   struct ifreq ifr;
   snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
-  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_THAT(ioctl(s_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
   EXPECT_NE(ifr.ifr_ifindex, 0);
   return ifr.ifr_ifindex;
 }
@@ -149,7 +176,7 @@ TEST_P(RawPacketTest, Receive) {
 
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = s_;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -159,7 +186,7 @@ TEST_P(RawPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(s_, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
@@ -277,7 +304,7 @@ TEST_P(RawPacketTest, Send) {
          sizeof(kMessage));
 
   // Send it.
-  ASSERT_THAT(sendto(socket_, send_buf, sizeof(send_buf), 0,
+  ASSERT_THAT(sendto(s_, send_buf, sizeof(send_buf), 0,
                      reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
               SyscallSucceedsWithValue(sizeof(send_buf)));
 
@@ -286,13 +313,13 @@ TEST_P(RawPacketTest, Send) {
   pfd.fd = udp_sock.get();
   pfd.events = POLLIN;
   ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1));
-  pfd.fd = socket_;
+  pfd.fd = s_;
   pfd.events = POLLIN;
   ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1));
 
   // Receive on the packet socket.
   char recv_buf[sizeof(send_buf)];
-  ASSERT_THAT(recv(socket_, recv_buf, sizeof(recv_buf), 0),
+  ASSERT_THAT(recv(s_, recv_buf, sizeof(recv_buf), 0),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   ASSERT_EQ(memcmp(recv_buf, send_buf, sizeof(send_buf)), 0);
 
@@ -309,6 +336,260 @@ TEST_P(RawPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Check that setting SO_RCVBUF below min is clamped to the minimum
+// receive buffer size.
+TEST_P(RawPacketTest, SetSocketRecvBufBelowMin) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  // Discover minimum receive buf size by trying to set it to zero.
+  // See:
+  // https://github.com/torvalds/linux/blob/a5dc8300df75e8b8384b4c82225f1e4a0b4d9b55/net/core/sock.c#L820
+  constexpr int kRcvBufSz = 0;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+      SyscallSucceeds());
+
+  int min = 0;
+  socklen_t min_len = sizeof(min);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+              SyscallSucceeds());
+
+  // Linux doubles the value so let's use a value that when doubled will still
+  // be smaller than min.
+  int below_min = min / 2 - 1;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &below_min, sizeof(below_min)),
+      SyscallSucceeds());
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+              SyscallSucceeds());
+
+  ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_RCVBUF above max is clamped to the maximum
+// receive buffer size.
+TEST_P(RawPacketTest, SetSocketRecvBufAboveMax) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  // Discover max buf size by trying to set the largest possible buffer size.
+  constexpr int kRcvBufSz = 0xffffffff;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+      SyscallSucceeds());
+
+  int max = 0;
+  socklen_t max_len = sizeof(max);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+              SyscallSucceeds());
+
+  int above_max = max + 1;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &above_max, sizeof(above_max)),
+      SyscallSucceeds());
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+              SyscallSucceeds());
+  ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_RCVBUF min <= kRcvBufSz <= max is honored.
+TEST_P(RawPacketTest, SetSocketRecvBuf) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int max = 0;
+  int min = 0;
+  {
+    // Discover max buf size by trying to set a really large buffer size.
+    constexpr int kRcvBufSz = 0xffffffff;
+    ASSERT_THAT(
+        setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+        SyscallSucceeds());
+
+    max = 0;
+    socklen_t max_len = sizeof(max);
+    ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+                SyscallSucceeds());
+  }
+
+  {
+    // Discover minimum buffer size by trying to set the receive buffer size to
+    // zero.
+    // See:
+    // https://github.com/torvalds/linux/blob/a5dc8300df75e8b8384b4c82225f1e4a0b4d9b55/net/core/sock.c#L820
+    constexpr int kRcvBufSz = 0;
+    ASSERT_THAT(
+        setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+        SyscallSucceeds());
+
+    socklen_t min_len = sizeof(min);
+    ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+                SyscallSucceeds());
+  }
+
+  int quarter_sz = min + (max - min) / 4;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &quarter_sz, sizeof(quarter_sz)),
+      SyscallSucceeds());
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+              SyscallSucceeds());
+
+  // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+  // TODO(gvisor.dev/issue/2926): Remove when Netstack matches linux behavior.
+  if (!IsRunningOnGvisor()) {
+    quarter_sz *= 2;
+  }
+  ASSERT_EQ(quarter_sz, val);
+}
+
+// Check that setting SO_SNDBUF below min is clamped to the minimum
+// send buffer size.
+TEST_P(RawPacketTest, SetSocketSendBufBelowMin) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  // Discover minimum buffer size by trying to set it to zero.
+  constexpr int kSndBufSz = 0;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+      SyscallSucceeds());
+
+  int min = 0;
+  socklen_t min_len = sizeof(min);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+              SyscallSucceeds());
+
+  // Linux doubles the value so let's use a value that when doubled will still
+  // be smaller than min.
+  int below_min = min / 2 - 1;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &below_min, sizeof(below_min)),
+      SyscallSucceeds());
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+              SyscallSucceeds());
+
+  ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_SNDBUF above max is clamped to the maximum
+// send buffer size.
+TEST_P(RawPacketTest, SetSocketSendBufAboveMax) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  // Discover maximum buffer size by trying to set it to a large value.
+  constexpr int kSndBufSz = 0xffffffff;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+      SyscallSucceeds());
+
+  int max = 0;
+  socklen_t max_len = sizeof(max);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+              SyscallSucceeds());
+
+  int above_max = max + 1;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &above_max, sizeof(above_max)),
+      SyscallSucceeds());
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+              SyscallSucceeds());
+  ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_SNDBUF min <= kSndBufSz <= max is honored.
+TEST_P(RawPacketTest, SetSocketSendBuf) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int max = 0;
+  int min = 0;
+  {
+    // Discover maximum buffer size by trying to set it to a large value.
+    constexpr int kSndBufSz = 0xffffffff;
+    ASSERT_THAT(
+        setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+        SyscallSucceeds());
+
+    max = 0;
+    socklen_t max_len = sizeof(max);
+    ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+                SyscallSucceeds());
+  }
+
+  {
+    // Discover minimum buffer size by trying to set it to zero.
+    constexpr int kSndBufSz = 0;
+    ASSERT_THAT(
+        setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+        SyscallSucceeds());
+
+    socklen_t min_len = sizeof(min);
+    ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+                SyscallSucceeds());
+  }
+
+  int quarter_sz = min + (max - min) / 4;
+  ASSERT_THAT(
+      setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &quarter_sz, sizeof(quarter_sz)),
+      SyscallSucceeds());
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+              SyscallSucceeds());
+
+  // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+  // TODO(gvisor.dev/issue/2926): Remove the gvisor special casing when Netstack
+  // matches linux behavior.
+  if (!IsRunningOnGvisor()) {
+    quarter_sz *= 2;
+  }
+
+  ASSERT_EQ(quarter_sz, val);
+}
+
+#ifndef __fuchsia__
+
+TEST_P(RawPacketTest, SetSocketDetachFilterNoInstalledFilter) {
+  // TODO(gvisor.dev/issue/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+  //
+  // gVisor returns no error on SO_DETACH_FILTER even if there is no filter
+  // attached unlike linux which does return ENOENT in such cases. This is
+  // because gVisor doesn't support SO_ATTACH_FILTER and just silently returns
+  // success.
+  if (IsRunningOnGvisor()) {
+    constexpr int val = 0;
+    ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+                SyscallSucceeds());
+    return;
+  }
+  constexpr int val = 0;
+  ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+              SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(RawPacketTest, GetSocketDetachFilter) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_DETACH_FILTER, &val, &val_len),
+              SyscallFailsWithErrno(ENOPROTOOPT));
+}
+
+#endif  // __fuchsia__
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index aabfa6955..f9392b9e0 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -634,6 +634,11 @@ TEST_F(PtyTest, TermiosAffectsSlave) {
 // Verify this by setting ICRNL (which rewrites input \r to \n) and verify that
 // it has no effect on the master.
 TEST_F(PtyTest, MasterTermiosUnchangable) {
+  struct kernel_termios master_termios = {};
+  EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds());
+  master_termios.c_lflag |= ICRNL;
+  EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
+
   char c = '\r';
   ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc
index 05c4ed03f..ce54dc064 100644
--- a/test/syscalls/linux/raw_socket.cc
+++ b/test/syscalls/linux/raw_socket.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include <linux/capability.h>
+#ifndef __fuchsia__
+#include <linux/filter.h>
+#endif  // __fuchsia__
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
@@ -21,6 +24,7 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <unistd.h>
+
 #include <algorithm>
 
 #include "gtest/gtest.h"
@@ -790,10 +794,30 @@ void RawSocketTest::ReceiveBufFrom(int sock, char* recv_buf,
   ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sock, recv_buf, recv_buf_len));
 }
 
-INSTANTIATE_TEST_SUITE_P(AllInetTests, RawSocketTest,
-                         ::testing::Combine(
-                             ::testing::Values(IPPROTO_TCP, IPPROTO_UDP),
-                             ::testing::Values(AF_INET, AF_INET6)));
+#ifndef __fuchsia__
+
+TEST_P(RawSocketTest, SetSocketDetachFilterNoInstalledFilter) {
+  // TODO(gvisor.dev/issue/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+  if (IsRunningOnGvisor()) {
+    constexpr int val = 0;
+    ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+                SyscallSucceeds());
+    return;
+  }
+
+  constexpr int val = 0;
+  ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+              SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(RawSocketTest, GetSocketDetachFilter) {
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_DETACH_FILTER, &val, &val_len),
+              SyscallFailsWithErrno(ENOPROTOOPT));
+}
+
+#endif  // __fuchsia__
 
 // AF_INET6+SOCK_RAW+IPPROTO_RAW sockets can be created, but not written to.
 TEST(RawSocketTest, IPv6ProtoRaw) {
@@ -813,6 +837,11 @@ TEST(RawSocketTest, IPv6ProtoRaw) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    AllInetTests, RawSocketTest,
+    ::testing::Combine(::testing::Values(IPPROTO_TCP, IPPROTO_UDP),
+                       ::testing::Values(AF_INET, AF_INET6)));
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc
index 0a27506aa..5bb14d57c 100644
--- a/test/syscalls/linux/raw_socket_hdrincl.cc
+++ b/test/syscalls/linux/raw_socket_hdrincl.cc
@@ -167,7 +167,7 @@ TEST_F(RawHDRINCL, NotReadable) {
   // nothing to be read.
   char buf[117];
   ASSERT_THAT(RetryEINTR(recv)(socket_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EINVAL));
+              SyscallFailsWithErrno(EAGAIN));
 }
 
 // Test that we can connect() to a valid IP (loopback).
@@ -273,14 +273,14 @@ TEST_F(RawHDRINCL, SendAndReceive) {
   // The network stack should have set the source address.
   EXPECT_EQ(src.sin_family, AF_INET);
   EXPECT_EQ(absl::gbswap_32(src.sin_addr.s_addr), INADDR_LOOPBACK);
-  // The packet ID should be 0, as the packet is less than 68 bytes.
-  struct iphdr iphdr = {};
-  memcpy(&iphdr, recv_buf, sizeof(iphdr));
-  EXPECT_EQ(iphdr.id, 0);
+  // The packet ID should not be 0, as the packet has DF=0.
+  struct iphdr* iphdr = reinterpret_cast<struct iphdr*>(recv_buf);
+  EXPECT_NE(iphdr->id, 0);
 }
 
-// Send and receive a packet with nonzero IP ID.
-TEST_F(RawHDRINCL, SendAndReceiveNonzeroID) {
+// Send and receive a packet where the sendto address is not the same as the
+// provided destination.
+TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) {
   int port = 40000;
   if (!IsRunningOnGvisor()) {
     port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
@@ -292,19 +292,24 @@ TEST_F(RawHDRINCL, SendAndReceiveNonzeroID) {
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
 
-  // Construct a packet with an IP header, UDP header, and payload. Make the
-  // payload large enough to force an IP ID to be assigned.
-  constexpr char kPayload[128] = {};
+  // Construct a packet with an IP header, UDP header, and payload.
+  constexpr char kPayload[] = "toto";
   char packet[sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kPayload)];
   ASSERT_TRUE(
       FillPacket(packet, sizeof(packet), port, kPayload, sizeof(kPayload)));
+  // Overwrite the IP destination address with an IP we can't get to.
+  struct iphdr iphdr = {};
+  memcpy(&iphdr, packet, sizeof(iphdr));
+  iphdr.daddr = 42;
+  memcpy(packet, &iphdr, sizeof(iphdr));
 
   socklen_t addrlen = sizeof(addr_);
   ASSERT_NO_FATAL_FAILURE(sendto(socket_, &packet, sizeof(packet), 0,
                                  reinterpret_cast<struct sockaddr*>(&addr_),
                                  addrlen));
 
-  // Receive the payload.
+  // Receive the payload, since sendto should replace the bad destination with
+  // localhost.
   char recv_buf[sizeof(packet)];
   struct sockaddr_in src;
   socklen_t src_size = sizeof(src);
@@ -318,47 +323,58 @@ TEST_F(RawHDRINCL, SendAndReceiveNonzeroID) {
   // The network stack should have set the source address.
   EXPECT_EQ(src.sin_family, AF_INET);
   EXPECT_EQ(absl::gbswap_32(src.sin_addr.s_addr), INADDR_LOOPBACK);
-  // The packet ID should not be 0, as the packet was more than 68 bytes.
-  struct iphdr* iphdr = reinterpret_cast<struct iphdr*>(recv_buf);
-  EXPECT_NE(iphdr->id, 0);
+  // The packet ID should not be 0, as the packet has DF=0.
+  struct iphdr recv_iphdr = {};
+  memcpy(&recv_iphdr, recv_buf, sizeof(recv_iphdr));
+  EXPECT_NE(recv_iphdr.id, 0);
+  // The destination address should be localhost, not the bad IP we set
+  // initially.
+  EXPECT_EQ(absl::gbswap_32(recv_iphdr.daddr), INADDR_LOOPBACK);
 }
 
-// Send and receive a packet where the sendto address is not the same as the
-// provided destination.
-TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) {
+// Send and receive a packet w/ the IP_HDRINCL option set.
+TEST_F(RawHDRINCL, SendAndReceiveIPHdrIncl) {
   int port = 40000;
   if (!IsRunningOnGvisor()) {
     port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
         PortAvailable(0, AddressFamily::kIpv4, SocketType::kUdp, false)));
   }
 
-  // IPPROTO_RAW sockets are write-only. We'll have to open another socket to
-  // read what we write.
-  FileDescriptor udp_sock =
+  FileDescriptor recv_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
 
+  FileDescriptor send_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
+
+  // Enable IP_HDRINCL option so that we can build and send w/ an IP
+  // header.
+  constexpr int kSockOptOn = 1;
+  ASSERT_THAT(setsockopt(send_sock.get(), SOL_IP, IP_HDRINCL, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  // This is not strictly required but we do it to make sure that setting
+  // IP_HDRINCL on a non IPPROTO_RAW socket does not prevent it from receiving
+  // packets.
+  ASSERT_THAT(setsockopt(recv_sock.get(), SOL_IP, IP_HDRINCL, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
   // Construct a packet with an IP header, UDP header, and payload.
   constexpr char kPayload[] = "toto";
   char packet[sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kPayload)];
   ASSERT_TRUE(
       FillPacket(packet, sizeof(packet), port, kPayload, sizeof(kPayload)));
-  // Overwrite the IP destination address with an IP we can't get to.
-  struct iphdr iphdr = {};
-  memcpy(&iphdr, packet, sizeof(iphdr));
-  iphdr.daddr = 42;
-  memcpy(packet, &iphdr, sizeof(iphdr));
 
   socklen_t addrlen = sizeof(addr_);
-  ASSERT_NO_FATAL_FAILURE(sendto(socket_, &packet, sizeof(packet), 0,
+  ASSERT_NO_FATAL_FAILURE(sendto(send_sock.get(), &packet, sizeof(packet), 0,
                                  reinterpret_cast<struct sockaddr*>(&addr_),
                                  addrlen));
 
-  // Receive the payload, since sendto should replace the bad destination with
-  // localhost.
+  // Receive the payload.
   char recv_buf[sizeof(packet)];
   struct sockaddr_in src;
   socklen_t src_size = sizeof(src);
-  ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), 0,
+  ASSERT_THAT(recvfrom(recv_sock.get(), recv_buf, sizeof(recv_buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_size),
               SyscallSucceedsWithValue(sizeof(packet)));
   EXPECT_EQ(
@@ -368,13 +384,20 @@ TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) {
   // The network stack should have set the source address.
   EXPECT_EQ(src.sin_family, AF_INET);
   EXPECT_EQ(absl::gbswap_32(src.sin_addr.s_addr), INADDR_LOOPBACK);
-  // The packet ID should be 0, as the packet is less than 68 bytes.
-  struct iphdr recv_iphdr = {};
-  memcpy(&recv_iphdr, recv_buf, sizeof(recv_iphdr));
-  EXPECT_EQ(recv_iphdr.id, 0);
-  // The destination address should be localhost, not the bad IP we set
-  // initially.
-  EXPECT_EQ(absl::gbswap_32(recv_iphdr.daddr), INADDR_LOOPBACK);
+  struct iphdr iphdr = {};
+  memcpy(&iphdr, recv_buf, sizeof(iphdr));
+  EXPECT_NE(iphdr.id, 0);
+
+  // Also verify that the packet we just sent was not delivered to the
+  // IPPROTO_RAW socket.
+  {
+    char recv_buf[sizeof(packet)];
+    struct sockaddr_in src;
+    socklen_t src_size = sizeof(src);
+    ASSERT_THAT(recvfrom(socket_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT,
+                         reinterpret_cast<struct sockaddr*>(&src), &src_size),
+                SyscallFailsWithErrno(EAGAIN));
+  }
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index a4d2953e1..0cea7d11f 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#ifndef __fuchsia__
+#include <linux/filter.h>
+#endif  // __fuchsia__
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -1559,6 +1562,63 @@ TEST_P(SimpleTcpSocketTest, SetTCPWindowClampAboveHalfMinRcvBuf) {
   }
 }
 
+#ifndef __fuchsia__
+
+// TODO(gvisor.dev/issue/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+// gVisor currently silently ignores attaching a filter.
+TEST_P(SimpleTcpSocketTest, SetSocketAttachDetachFilter) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // Program generated using sudo tcpdump -i lo tcp and port 1234 -dd
+  struct sock_filter code[] = {
+      {0x28, 0, 0, 0x0000000c},  {0x15, 0, 6, 0x000086dd},
+      {0x30, 0, 0, 0x00000014},  {0x15, 0, 15, 0x00000006},
+      {0x28, 0, 0, 0x00000036},  {0x15, 12, 0, 0x000004d2},
+      {0x28, 0, 0, 0x00000038},  {0x15, 10, 11, 0x000004d2},
+      {0x15, 0, 10, 0x00000800}, {0x30, 0, 0, 0x00000017},
+      {0x15, 0, 8, 0x00000006},  {0x28, 0, 0, 0x00000014},
+      {0x45, 6, 0, 0x00001fff},  {0xb1, 0, 0, 0x0000000e},
+      {0x48, 0, 0, 0x0000000e},  {0x15, 2, 0, 0x000004d2},
+      {0x48, 0, 0, 0x00000010},  {0x15, 0, 1, 0x000004d2},
+      {0x6, 0, 0, 0x00040000},   {0x6, 0, 0, 0x00000000},
+  };
+  struct sock_fprog bpf = {
+      .len = ABSL_ARRAYSIZE(code),
+      .filter = code,
+  };
+  ASSERT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)),
+      SyscallSucceeds());
+
+  constexpr int val = 0;
+  ASSERT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+      SyscallSucceeds());
+}
+
+TEST_P(SimpleTcpSocketTest, SetSocketDetachFilterNoInstalledFilter) {
+  // TODO(gvisor.dev/issue/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+  SKIP_IF(IsRunningOnGvisor());
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  constexpr int val = 0;
+  ASSERT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+      SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(SimpleTcpSocketTest, GetSocketDetachFilter) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, &val_len),
+              SyscallFailsWithErrno(ENOPROTOOPT));
+}
+
+#endif  // __fuchsia__
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9cc6be4fb..60c48ed6e 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -16,6 +16,9 @@
 
 #include <arpa/inet.h>
 #include <fcntl.h>
+#ifndef __fuchsia__
+#include <linux/filter.h>
+#endif  // __fuchsia__
 #include <netinet/in.h>
 #include <poll.h>
 #include <sys/ioctl.h>
@@ -1723,5 +1726,56 @@ TEST_P(UdpSocketTest, RecvBufLimits) {
   }
 }
 
+#ifndef __fuchsia__
+
+// TODO(gvisor.dev/issue/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+// gVisor currently silently ignores attaching a filter.
+TEST_P(UdpSocketTest, SetSocketDetachFilter) {
+  // Program generated using sudo tcpdump -i lo udp and port 1234 -dd
+  struct sock_filter code[] = {
+      {0x28, 0, 0, 0x0000000c},  {0x15, 0, 6, 0x000086dd},
+      {0x30, 0, 0, 0x00000014},  {0x15, 0, 15, 0x00000011},
+      {0x28, 0, 0, 0x00000036},  {0x15, 12, 0, 0x000004d2},
+      {0x28, 0, 0, 0x00000038},  {0x15, 10, 11, 0x000004d2},
+      {0x15, 0, 10, 0x00000800}, {0x30, 0, 0, 0x00000017},
+      {0x15, 0, 8, 0x00000011},  {0x28, 0, 0, 0x00000014},
+      {0x45, 6, 0, 0x00001fff},  {0xb1, 0, 0, 0x0000000e},
+      {0x48, 0, 0, 0x0000000e},  {0x15, 2, 0, 0x000004d2},
+      {0x48, 0, 0, 0x00000010},  {0x15, 0, 1, 0x000004d2},
+      {0x6, 0, 0, 0x00040000},   {0x6, 0, 0, 0x00000000},
+  };
+  struct sock_fprog bpf = {
+      .len = ABSL_ARRAYSIZE(code),
+      .filter = code,
+  };
+  ASSERT_THAT(
+      setsockopt(sock_.get(), SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)),
+      SyscallSucceeds());
+
+  constexpr int val = 0;
+  ASSERT_THAT(
+      setsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+      SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, SetSocketDetachFilterNoInstalledFilter) {
+  // TODO(gvisor.dev/issue/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+  SKIP_IF(IsRunningOnGvisor());
+  constexpr int val = 0;
+  ASSERT_THAT(
+      setsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+      SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, GetSocketDetachFilter) {
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(
+      getsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, &val_len),
+      SyscallFailsWithErrno(ENOPROTOOPT));
+}
+
+#endif  // __fuchsia__
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 052781445..5418948fe 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -125,12 +125,12 @@ PosixErrorOr<struct stat> Fstat(int fd) {
 
 PosixErrorOr<bool> Exists(absl::string_view path) {
   struct stat stat_buf;
-  int res = stat(std::string(path).c_str(), &stat_buf);
+  int res = lstat(std::string(path).c_str(), &stat_buf);
   if (res < 0) {
     if (errno == ENOENT) {
       return false;
     }
-    return PosixError(errno, absl::StrCat("stat ", path));
+    return PosixError(errno, absl::StrCat("lstat ", path));
   }
   return true;
 }
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index caf19b24d..8cdac23a1 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -44,9 +44,14 @@ PosixErrorOr<std::string> GetCWD();
 // can't be determined.
 PosixErrorOr<bool> Exists(absl::string_view path);
 
-// Returns a stat structure for the given path or an error.
+// Returns a stat structure for the given path or an error. If the path
+// represents a symlink, it will be traversed.
 PosixErrorOr<struct stat> Stat(absl::string_view path);
 
+// Returns a stat structure for the given path or an error. If the path
+// represents a symlink, it will not be traversed.
+PosixErrorOr<struct stat> Lstat(absl::string_view path);
+
 // Returns a stat struct for the given fd.
 PosixErrorOr<struct stat> Fstat(int fd);
 
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index 9c10b6674..e1bdee7fd 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -56,7 +56,7 @@ void TryDeleteRecursively(std::string const& path) {
     if (undeleted_dirs || undeleted_files || !status.ok()) {
       std::cerr << path << ": failed to delete " << undeleted_dirs
                 << " directories and " << undeleted_files
-                << " files: " << status;
+                << " files: " << status << std::endl;
     }
   }
 }
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 109078fc7..0f9781038 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -567,6 +567,25 @@ ssize_t ApplyFileIoSyscall(F const& f, size_t const count) {
 
 }  // namespace internal
 
+inline PosixErrorOr<std::string> ReadAllFd(int fd) {
+  std::string all;
+  all.reserve(128 * 1024);  // arbitrary.
+
+  std::vector<char> buffer(16 * 1024);
+  for (;;) {
+    auto const bytes = RetryEINTR(read)(fd, buffer.data(), buffer.size());
+    if (bytes < 0) {
+      return PosixError(errno, "file read");
+    }
+    if (bytes == 0) {
+      return std::move(all);
+    }
+    if (bytes > 0) {
+      all.append(buffer.data(), bytes);
+    }
+  }
+}
+
 inline ssize_t ReadFd(int fd, void* buf, size_t count) {
   return internal::ApplyFileIoSyscall(
       [&](size_t completed) {