220 files changed, 6850 insertions, 1148 deletions
diff --git a/WORKSPACE b/WORKSPACE
index 8f50a3e57..57e6f3558 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -3,10 +3,10 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "078f2a9569fa9ed846e60805fb5fb167d6f6c4ece48e6d409bf5fb2154eaf0d8",
+    sha256 = "842ec0e6b4fbfdd3de6150b61af92901eeb73681fd4d185746644c338f51d4c0",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.20.0/rules_go-v0.20.0.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.20.0/rules_go-v0.20.0.tar.gz",
+        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.20.1/rules_go-v0.20.1.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.20.1/rules_go-v0.20.1.tar.gz",
     ],
 )
 
@@ -24,7 +24,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_to
 go_rules_dependencies()
 
 go_register_toolchains(
-    go_version = "1.13.1",
+    go_version = "1.13.3",
     nogo = "@//:nogo",
 )
 
diff --git a/kokoro/swgso_tests.cfg b/kokoro/swgso_tests.cfg
new file mode 100644
index 000000000..101a9c607
--- /dev/null
+++ b/kokoro/swgso_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/swgso_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD
index 32c601a03..f5c08ea06 100644
--- a/pkg/abi/BUILD
+++ b/pkg/abi/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "abi",
     srcs = [
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index f45934466..7c17109a6 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -1,13 +1,12 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
+
 # Package linux contains the constants and types needed to interface with a
 # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix
 # when the host OS may not be Linux.
 
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "linux",
     srcs = [
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index d5b731390..2e2cc6be7 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -256,6 +256,17 @@ type SockAddrInet6 struct {
 	Scope_id uint32
 }
 
+// SockAddrLink is a struct sockaddr_ll, from uapi/linux/if_packet.h.
+type SockAddrLink struct {
+	Family          uint16
+	Protocol        uint16
+	InterfaceIndex  int32
+	ARPHardwareType uint16
+	PacketType      byte
+	HardwareAddrLen byte
+	HardwareAddr    [8]byte
+}
+
 // UnixPathMax is the maximum length of the path in an AF_UNIX socket.
 //
 // From uapi/linux/un.h.
@@ -278,6 +289,7 @@ type SockAddr interface {
 
 func (s *SockAddrInet) implementsSockAddr()    {}
 func (s *SockAddrInet6) implementsSockAddr()   {}
+func (s *SockAddrLink) implementsSockAddr()    {}
 func (s *SockAddrUnix) implementsSockAddr()    {}
 func (s *SockAddrNetlink) implementsSockAddr() {}
 
diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD
index 1b5dac99a..93b88a29a 100644
--- a/pkg/bits/BUILD
+++ b/pkg/bits/BUILD
@@ -1,10 +1,9 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
 go_library(
     name = "bits",
     srcs = [
diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD
index 8d31e068c..fba5643e8 100644
--- a/pkg/bpf/BUILD
+++ b/pkg/bpf/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "bpf",
     srcs = [
diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD
index 32422f9e2..ed111fd2a 100644
--- a/pkg/cpuid/BUILD
+++ b/pkg/cpuid/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "cpuid",
     srcs = [
diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD
index c7f549428..afa8f7659 100644
--- a/pkg/fd/BUILD
+++ b/pkg/fd/BUILD
@@ -8,9 +8,6 @@ go_library(
     srcs = ["fd.go"],
     importpath = "gvisor.dev/gvisor/pkg/fd",
     visibility = ["//visibility:public"],
-    deps = [
-        "//pkg/unet",
-    ],
 )
 
 go_test(
diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go
index 7691b477b..83bcfe220 100644
--- a/pkg/fd/fd.go
+++ b/pkg/fd/fd.go
@@ -22,8 +22,6 @@ import (
 	"runtime"
 	"sync/atomic"
 	"syscall"
-
-	"gvisor.dev/gvisor/pkg/unet"
 )
 
 // ReadWriter implements io.ReadWriter, io.ReaderAt, and io.WriterAt for fd. It
@@ -187,12 +185,6 @@ func OpenAt(dir *FD, path string, flags int, mode uint32) (*FD, error) {
 	return New(f), nil
 }
 
-// DialUnix connects to a Unix Domain Socket and return the file descriptor.
-func DialUnix(path string) (*FD, error) {
-	socket, err := unet.Connect(path, false)
-	return New(socket.FD()), err
-}
-
 // Close closes the file descriptor contained in the FD.
 //
 // Close is safe to call multiple times, but will return an error after the
diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD
index 827385139..7ad59dfd7 100644
--- a/pkg/refs/BUILD
+++ b/pkg/refs/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "weak_ref_list",
     out = "weak_ref_list.go",
diff --git a/pkg/segment/BUILD b/pkg/segment/BUILD
index 700385907..1b487b887 100644
--- a/pkg/segment/BUILD
+++ b/pkg/segment/BUILD
@@ -1,10 +1,10 @@
+load("//tools/go_generics:defs.bzl", "go_template")
+
 package(
     default_visibility = ["//:sandbox"],
     licenses = ["notice"],
 )
 
-load("//tools/go_generics:defs.bzl", "go_template")
-
 go_template(
     name = "generic_range",
     srcs = ["range.go"],
diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD
index 12d7c77d2..a27c35e21 100644
--- a/pkg/segment/test/BUILD
+++ b/pkg/segment/test/BUILD
@@ -1,13 +1,12 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(
     default_visibility = ["//visibility:private"],
     licenses = ["notice"],
 )
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "int_range",
     out = "int_range.go",
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index c71cff9f3..18c73cc24 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
 load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "arch",
     srcs = [
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD
index 3b6841b7e..581e7aa96 100644
--- a/pkg/sentry/context/contexttest/BUILD
+++ b/pkg/sentry/context/contexttest/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "contexttest",
     testonly = 1,
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD
index 0c86197f7..1098ed777 100644
--- a/pkg/sentry/device/BUILD
+++ b/pkg/sentry/device/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "device",
     srcs = ["device.go"],
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index 3119a61b6..378602cc9 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "fs",
     srcs = [
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 80e106e6f..a0d9e8496 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "dev",
     srcs = [
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index b9bd9ed17..277ee4c31 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "fdpipe",
     srcs = [
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD
index a9d6d9301..358dc2be3 100644
--- a/pkg/sentry/fs/filetest/BUILD
+++ b/pkg/sentry/fs/filetest/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "filetest",
     testonly = 1,
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index b4ac83dc4..b2e8d9c77 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "dirty_set_impl",
     out = "dirty_set_impl.go",
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index e239f12a5..b06a71cc2 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -209,3 +209,29 @@ func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) {
 	}
 	delete(f.mappings, chunkStart)
 }
+
+// RegenerateMappings must be called when the file description mapped by f
+// changes, to replace existing mappings of the previous file description.
+func (f *HostFileMapper) RegenerateMappings(fd int) error {
+	f.mapsMu.Lock()
+	defer f.mapsMu.Unlock()
+
+	for chunkStart, m := range f.mappings {
+		prot := syscall.PROT_READ
+		if m.writable {
+			prot |= syscall.PROT_WRITE
+		}
+		_, _, errno := syscall.Syscall6(
+			syscall.SYS_MMAP,
+			m.addr,
+			chunkSize,
+			uintptr(prot),
+			syscall.MAP_SHARED|syscall.MAP_FIXED,
+			uintptr(fd),
+			uintptr(chunkStart))
+		if errno != 0 {
+			return errno
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index 693625ddc..30475f340 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -100,13 +100,30 @@ func (h *HostMappable) Translate(ctx context.Context, required, optional memmap.
 }
 
 // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
-func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error {
+func (h *HostMappable) InvalidateUnsavable(_ context.Context) error {
 	h.mu.Lock()
 	h.mappings.InvalidateAll(memmap.InvalidateOpts{})
 	h.mu.Unlock()
 	return nil
 }
 
+// NotifyChangeFD must be called after the file description represented by
+// CachedFileObject.FD() changes.
+func (h *HostMappable) NotifyChangeFD() error {
+	// Update existing sentry mappings to refer to the new file description.
+	if err := h.hostFileMapper.RegenerateMappings(h.backingFile.FD()); err != nil {
+		return err
+	}
+
+	// Shoot down existing application mappings of the old file description;
+	// they will be remapped with the new file description on demand.
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	h.mappings.InvalidateAll(memmap.InvalidateOpts{})
+	return nil
+}
+
 // MapInternal implements platform.File.MapInternal.
 func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
 	return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write)
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index dd80757dc..798920d18 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -959,6 +959,23 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error
 	return nil
 }
 
+// NotifyChangeFD must be called after the file description represented by
+// CachedFileObject.FD() changes.
+func (c *CachingInodeOperations) NotifyChangeFD() error {
+	// Update existing sentry mappings to refer to the new file description.
+	if err := c.hostFileMapper.RegenerateMappings(c.backingFile.FD()); err != nil {
+		return err
+	}
+
+	// Shoot down existing application mappings of the old file description;
+	// they will be remapped with the new file description on demand.
+	c.mapsMu.Lock()
+	defer c.mapsMu.Unlock()
+
+	c.mappings.InvalidateAll(memmap.InvalidateOpts{})
+	return nil
+}
+
 // Evict implements pgalloc.EvictableMemoryUser.Evict.
 func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 	c.mapsMu.Lock()
@@ -1027,7 +1044,6 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
 	}
 	c.refs.MergeAdjacent(fr)
 	c.dataMu.Unlock()
-
 }
 
 // MapInternal implements platform.File.MapInternal. This is used when we
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 2b71ca0e1..4a005c605 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "gofer",
     srcs = [
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index 9e2e412cd..7960b9c7b 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -214,28 +214,64 @@ func (f *fileOperations) readdirAll(ctx context.Context) (map[string]fs.DentAttr
 	return entries, nil
 }
 
+// maybeSync flushes the n bytes written at offset when required: via WriteOut
+// if the cache policy is write-through, or via Fsync if Direct, Sync or DSync is set.
+func (f *fileOperations) maybeSync(ctx context.Context, file *fs.File, offset, n int64) error {
+	if n == 0 {
+		// Nothing to sync.
+		return nil
+	}
+
+	if f.inodeOperations.session().cachePolicy.writeThrough(file.Dirent.Inode) {
+		// Call WriteOut directly, as some "writethrough" filesystems
+		// do not support sync.
+		return f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode)
+	}
+
+	flags := file.Flags()
+	var syncType fs.SyncType
+	switch {
+	case flags.Direct || flags.Sync:
+		syncType = fs.SyncAll
+	case flags.DSync:
+		syncType = fs.SyncData
+	default:
+		// No need to sync.
+		return nil
+	}
+
+	return f.Fsync(ctx, file, offset, offset+n, syncType)
+}
+
 // Write implements fs.FileOperations.Write.
 func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
 	if fs.IsDir(file.Dirent.Inode.StableAttr) {
 		// Not all remote file systems enforce this so this client does.
 		return 0, syserror.EISDIR
 	}
-	cp := f.inodeOperations.session().cachePolicy
-	if cp.useCachingInodeOps(file.Dirent.Inode) {
-		n, err := f.inodeOperations.cachingInodeOps.Write(ctx, src, offset)
-		if err != nil {
-			return n, err
-		}
-		if cp.writeThrough(file.Dirent.Inode) {
-			// Write out the file.
-			err = f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode)
-		}
-		return n, err
+
+	var (
+		n   int64
+		err error
+	)
+	// The write is handled in different ways depending on the cache policy
+	// and availability of a host-mappable FD.
+	if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) {
+		n, err = f.inodeOperations.cachingInodeOps.Write(ctx, src, offset)
+	} else if f.inodeOperations.fileState.hostMappable != nil {
+		n, err = f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset)
+	} else {
+		n, err = src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset))
 	}
-	if f.inodeOperations.fileState.hostMappable != nil {
-		return f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset)
+
+	// We may need to sync the written bytes.
+	if syncErr := f.maybeSync(ctx, file, offset, n); syncErr != nil {
+		// Sync failed. Report 0 bytes written, since none of them are
+		// guaranteed to have been synced.
+		return 0, syncErr
 	}
-	return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset))
+
+	return n, err
 }
 
 // incrementReadCounters increments the read counters for the read starting at the given time. We
@@ -273,7 +309,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO
 }
 
 // Fsync implements fs.FileOperations.Fsync.
-func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
+func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start, end int64, syncType fs.SyncType) error {
 	switch syncType {
 	case fs.SyncAll, fs.SyncData:
 		if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go
index 9aa68a70e..c2fbb4be9 100644
--- a/pkg/sentry/fs/gofer/file_state.go
+++ b/pkg/sentry/fs/gofer/file_state.go
@@ -29,7 +29,7 @@ func (f *fileOperations) afterLoad() {
 		// Manually load the open handles.
 		var err error
 		// TODO(b/38173783): Context is not plumbed to save/restore.
-		f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags)
+		f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags, f.inodeOperations.cachingInodeOps)
 		if err != nil {
 			return fmt.Errorf("failed to re-open handle: %v", err)
 		}
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index 8f8ab5d29..cf96dd9fa 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -58,6 +58,11 @@ const (
 	// If present, sets CachingInodeOperationsOptions.LimitHostFDTranslation to
 	// true.
 	limitHostFDTranslationKey = "limit_host_fd_translation"
+
+	// overlayfsStaleRead, if present, recreates the cached read handle when the
+	// file is first opened for write. This works around an overlayfs limitation in
+	// kernels before 4.19 where open FDs are not updated after the file is copied up.
+	overlayfsStaleRead = "overlayfs_stale_read"
 )
 
 // defaultAname is the default attach name.
@@ -145,6 +150,7 @@ type opts struct {
 	version                string
 	privateunixsocket      bool
 	limitHostFDTranslation bool
+	overlayfsStaleRead     bool
 }
 
 // options parses mount(2) data into structured options.
@@ -247,6 +253,11 @@ func options(data string) (opts, error) {
 		delete(options, limitHostFDTranslationKey)
 	}
 
+	if _, ok := options[overlayfsStaleRead]; ok {
+		o.overlayfsStaleRead = true
+		delete(options, overlayfsStaleRead)
+	}
+
 	// Fail to attach if the caller wanted us to do something that we
 	// don't support.
 	if len(options) > 0 {
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index 27eeae3d9..39c8ec33d 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -39,14 +39,22 @@ type handles struct {
 
 	// Host is an *fd.FD handle. May be nil.
 	Host *fd.FD
+
+	// isHostBorrowed tells whether 'Host' is owned or borrowed. If owned, it's
+	// closed on destruction, otherwise it's released.
+	isHostBorrowed bool
 }
 
 // DecRef drops a reference on handles.
 func (h *handles) DecRef() {
 	h.DecRefWithDestructor(func() {
 		if h.Host != nil {
-			if err := h.Host.Close(); err != nil {
-				log.Warningf("error closing host file: %v", err)
+			if h.isHostBorrowed {
+				h.Host.Release()
+			} else {
+				if err := h.Host.Close(); err != nil {
+					log.Warningf("error closing host file: %v", err)
+				}
 			}
 		}
 		// FIXME(b/38173783): Context is not plumbed here.
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index d918d6620..99910388f 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -100,7 +100,7 @@ type inodeFileState struct {
 	// true.
 	//
 	// Once readHandles becomes non-nil, it can't be changed until
-	// inodeFileState.Release(), because of a defect in the
+	// inodeFileState.Release()*, because of a defect in the
 	// fsutil.CachedFileObject interface: there's no way for the caller of
 	// fsutil.CachedFileObject.FD() to keep the returned FD open, so if we
 	// racily replace readHandles after inodeFileState.FD() has returned
@@ -108,6 +108,9 @@ type inodeFileState struct {
 	// FD. writeHandles can be changed if writeHandlesRW is false, since
 	// inodeFileState.FD() can't return a write-only FD, but can't be changed
 	// if writeHandlesRW is true for the same reason.
+	//
+	// * There is one notable exception in recreateReadHandles(), where it dup's
+	// the FD and invalidates the page cache.
 	readHandles    *handles `state:"nosave"`
 	writeHandles   *handles `state:"nosave"`
 	writeHandlesRW bool     `state:"nosave"`
@@ -175,43 +178,124 @@ func (i *inodeFileState) setSharedHandlesLocked(flags fs.FileFlags, h *handles)
 
 // getHandles returns a set of handles for a new file using i opened with the
 // given flags.
-func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags) (*handles, error) {
+func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags, cache *fsutil.CachingInodeOperations) (*handles, error) {
 	if !i.canShareHandles() {
 		return newHandles(ctx, i.file, flags)
 	}
+
 	i.handlesMu.Lock()
-	defer i.handlesMu.Unlock()
+	h, invalidate, err := i.getHandlesLocked(ctx, flags)
+	i.handlesMu.Unlock()
+
+	if invalidate {
+		cache.NotifyChangeFD()
+		if i.hostMappable != nil {
+			i.hostMappable.NotifyChangeFD()
+		}
+	}
+
+	return h, err
+}
+
+// getHandlesLocked returns a pointer to cached handles and a boolean indicating
+// whether a previously open read handle was recreated; if so, host mappings
+// must be invalidated. The caller must hold i.handlesMu.
+func (i *inodeFileState) getHandlesLocked(ctx context.Context, flags fs.FileFlags) (*handles, bool, error) {
 	// Do we already have usable shared handles?
 	if flags.Write {
 		if i.writeHandles != nil && (i.writeHandlesRW || !flags.Read) {
 			i.writeHandles.IncRef()
-			return i.writeHandles, nil
+			return i.writeHandles, false, nil
 		}
 	} else if i.readHandles != nil {
 		i.readHandles.IncRef()
-		return i.readHandles, nil
+		return i.readHandles, false, nil
 	}
+
 	// No; get new handles and cache them for future sharing.
 	h, err := newHandles(ctx, i.file, flags)
 	if err != nil {
-		return nil, err
+		return nil, false, err
+	}
+
+	// Read handle invalidation is needed only if all of the following hold:
+	//   - Mount option 'overlayfs_stale_read' is set
+	//   - Read handle is open: nothing to invalidate otherwise
+	//   - Write handle is not open: file was not open for write and is being
+	//     opened for write now (will trigger copy up in overlayfs).
+	invalidate := false
+	if i.s.overlayfsStaleRead && i.readHandles != nil && i.writeHandles == nil && flags.Write {
+		if err := i.recreateReadHandles(ctx, h, flags); err != nil {
+			return nil, false, err
+		}
+		invalidate = true
 	}
 	i.setSharedHandlesLocked(flags, h)
-	return h, nil
+	return h, invalidate, nil
+}
+
+func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handles, flags fs.FileFlags) error {
+	h := writer
+	if !flags.Read {
+		// Writer can't be used for read, must create a new handle.
+		var err error
+		h, err = newHandles(ctx, i.file, fs.FileFlags{Read: true})
+		if err != nil {
+			return err
+		}
+		defer h.DecRef()
+	}
+
+	if i.readHandles.Host == nil {
+		// If current readHandles doesn't have a host FD, it can simply be replaced.
+		i.readHandles.DecRef()
+
+		h.IncRef()
+		i.readHandles = h
+		return nil
+	}
+
+	if h.Host == nil {
+		// Current read handle has a host FD and can't be replaced with one that
+		// doesn't, because it breaks fsutil.CachedFileObject.FD() contract.
+		log.Warningf("Read handle can't be invalidated, reads may return stale data")
+		return nil
+	}
+
+	// Due to a defect in the fsutil.CachedFileObject interface,
+	// readHandles.Host.FD() may be used outside locks, making it impossible to
+	// reliably close it. To work around this, we dup the new FD into the old one,
+	// so operations on the old will see the new data. Then, make the new handle
+	// take ownership of the old FD and mark the old readHandle to not close the
+	// FD when done.
+	if err := syscall.Dup2(h.Host.FD(), i.readHandles.Host.FD()); err != nil {
+		return err
+	}
+
+	h.Host.Close()
+	h.Host = fd.New(i.readHandles.Host.FD())
+	i.readHandles.isHostBorrowed = true
+	i.readHandles.DecRef()
+
+	h.IncRef()
+	i.readHandles = h
+	return nil
 }
 
 // ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt.
 func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
 	i.handlesMu.RLock()
-	defer i.handlesMu.RUnlock()
-	return i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts)
+	n, err := i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts)
+	i.handlesMu.RUnlock()
+	return n, err
 }
 
 // WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt.
 func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
 	i.handlesMu.RLock()
-	defer i.handlesMu.RUnlock()
-	return i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs)
+	n, err := i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs)
+	i.handlesMu.RUnlock()
+	return n, err
 }
 
 // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
@@ -449,7 +533,7 @@ func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (*
 }
 
 func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
-	h, err := i.fileState.getHandles(ctx, flags)
+	h, err := i.fileState.getHandles(ctx, flags, i.cachingInodeOps)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 50da865c1..0da608548 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -122,6 +122,10 @@ type session struct {
 	// CachingInodeOperations created by the session.
 	limitHostFDTranslation bool
 
+	// overlayfsStaleRead, when set, causes the read-only handle to be invalidated
+	// when the file is opened for write.
+	overlayfsStaleRead bool
+
 	// connID is a unique identifier for the session connection.
 	connID string `state:"wait"`
 
@@ -257,6 +261,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 		aname:                  o.aname,
 		superBlockFlags:        superBlockFlags,
 		limitHostFDTranslation: o.limitHostFDTranslation,
+		overlayfsStaleRead:     o.overlayfsStaleRead,
 		mounter:                mounter,
 	}
 	s.EnableLeakCheck("gofer.session")
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 3e532332e..1cbed07ae 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "host",
     srcs = [
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 2392787cb..107336a3e 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -385,3 +385,6 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
 func (c *ConnectedEndpoint) Release() {
 	c.ref.DecRefWithDestructor(c.close)
 }
+
+// CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
+func (c *ConnectedEndpoint) CloseUnread() {}
diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD
index 5a7a5b8cd..8d62642e7 100644
--- a/pkg/sentry/fs/lock/BUILD
+++ b/pkg/sentry/fs/lock/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "lock_range",
     out = "lock_range.go",
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index c307603a6..75cbb0622 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "proc",
     srcs = [
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index 76433c7d0..fe7067be1 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "seqfile",
     srcs = ["seqfile.go"],
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index d0f351e5a..012cb3e44 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "ramfs",
     srcs = [
diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD
index 70fa3af89..25f0f124e 100644
--- a/pkg/sentry/fs/sys/BUILD
+++ b/pkg/sentry/fs/sys/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "sys",
     srcs = [
diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD
index 1d80daeaf..a215c1b95 100644
--- a/pkg/sentry/fs/timerfd/BUILD
+++ b/pkg/sentry/fs/timerfd/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "timerfd",
     srcs = ["timerfd.go"],
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 11b680929..59ce400c2 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "tmpfs",
     srcs = [
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 25811f668..95ad98cb0 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "tty",
     srcs = [
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index b0c286b7a..7ccff8b0d 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "dirent_list",
     out = "dirent_list.go",
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 0b471d121..91802dc1e 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -301,8 +301,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 	return offset, nil
 }
 
-// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
-func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	// mmap(2) specifies that EACCESS should be returned for non-regular file fds.
 	return syserror.EACCES
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index 2d50e30aa..fcfaf5c3e 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "disklayout",
     srcs = [
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index a0065343b..4d18b28cb 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -43,9 +43,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
 }
 
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *fileDescription) OnClose() error { return nil }
-
 // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
 func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
 	return fd.flags, nil
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index ffc76ba5b..aec33e00a 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -152,8 +152,8 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 	return offset, nil
 }
 
-// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
-func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	// TODO(b/134676337): Implement mmap(2).
 	return syserror.ENODEV
 }
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index e06548a98..bdf8705c1 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -105,7 +105,7 @@ func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int6
 	return 0, syserror.EBADF
 }
 
-// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
-func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	return syserror.EBADF
 }
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 7e364c5fd..04d667273 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -24,14 +24,18 @@ go_library(
         "directory.go",
         "filesystem.go",
         "memfs.go",
+        "named_pipe.go",
         "regular_file.go",
         "symlink.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/amutex",
+        "//pkg/sentry/arch",
         "//pkg/sentry/context",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
@@ -54,3 +58,19 @@ go_test(
         "//pkg/syserror",
     ],
 )
+
+go_test(
+    name = "memfs_test",
+    size = "small",
+    srcs = ["pipe_test.go"],
+    embed = [":memfs"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index f79e2d9c8..f006c40cd 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -233,7 +233,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 	if err != nil {
 		return err
 	}
-	_, err = checkCreateLocked(rp, parentVFSD, parentInode)
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
 	if err != nil {
 		return err
 	}
@@ -241,8 +241,40 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	// TODO: actually implement mknod
-	return syserror.EPERM
+
+	switch opts.Mode.FileType() {
+	case 0:
+		// "Zero file type is equivalent to type S_IFREG." - mknod(2)
+		fallthrough
+	case linux.ModeRegular:
+		// TODO(b/138862511): Implement.
+		return syserror.EINVAL
+
+	case linux.ModeNamedPipe:
+		child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+		parentVFSD.InsertChild(&child.vfsd, pc)
+		parentInode.impl.(*directory).childList.PushBack(child)
+		return nil
+
+	case linux.ModeSocket:
+		// TODO(b/138862511): Implement.
+		return syserror.EINVAL
+
+	case linux.ModeCharacterDevice:
+		fallthrough
+	case linux.ModeBlockDevice:
+		// TODO(b/72101894): We don't support creating block or character
+		// devices at the moment.
+		//
+		// When we start supporting block and character devices, we'll
+		// need to check for CAP_MKNOD here.
+		return syserror.EPERM
+
+	default:
+		// "EINVAL - mode requested creation of something other than a
+		// regular file, device special file, FIFO or socket." - mknod(2)
+		return syserror.EINVAL
+	}
 }
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
@@ -250,8 +282,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// Filter out flags that are not supported by memfs. O_DIRECTORY and
 	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
 	// appropriate bits in rp), but are returned by
-	// FileDescriptionImpl.StatusFlags().
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+	// FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by
+	// pipes.
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
 
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
@@ -260,7 +293,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err != nil {
 			return nil, err
 		}
-		return inode.open(rp, vfsd, opts.Flags, false)
+		return inode.open(ctx, rp, vfsd, opts.Flags, false)
 	}
 
 	mustCreate := opts.Flags&linux.O_EXCL != 0
@@ -275,7 +308,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		return inode.open(rp, vfsd, opts.Flags, false)
+		return inode.open(ctx, rp, vfsd, opts.Flags, false)
 	}
 afterTrailingSymlink:
 	// Walk to the parent directory of the last path component.
@@ -320,7 +353,7 @@ afterTrailingSymlink:
 		child := fs.newDentry(childInode)
 		vfsd.InsertChild(&child.vfsd, pc)
 		inode.impl.(*directory).childList.PushBack(child)
-		return childInode.open(rp, &child.vfsd, opts.Flags, true)
+		return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true)
 	}
 	// Open existing file or follow symlink.
 	if mustCreate {
@@ -336,10 +369,10 @@ afterTrailingSymlink:
 		// symlink target.
 		goto afterTrailingSymlink
 	}
-	return childInode.open(rp, childVFSD, opts.Flags, false)
+	return childInode.open(ctx, rp, childVFSD, opts.Flags, false)
 }
 
-func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(flags)
 	if !afterCreate {
 		if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
@@ -378,6 +411,8 @@ func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte
 	case *symlink:
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
+	case *namedPipe:
+		return newNamedPipeFD(ctx, impl, rp, vfsd, flags)
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index b78471c0f..64c851c1a 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -227,6 +227,8 @@ func (i *inode) statTo(stat *linux.Statx) {
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(len(impl.target))
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *namedPipe:
+		stat.Mode |= linux.S_IFIFO
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
new file mode 100644
index 000000000..732ed7c58
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -0,0 +1,59 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type namedPipe struct {
+	inode inode
+
+	pipe *pipe.VFSPipe
+}
+
+// newNamedPipe creates a named pipe and returns its inode. Preconditions:
+//   * fs.mu must be locked.
+//   * rp.Mount().CheckBeginWrite() has been called successfully.
+func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // Only the parent has a link.
+	return &file.inode
+}
+
+// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
+// entirely via struct embedding.
+type namedPipeFD struct {
+	fileDescription
+
+	*pipe.VFSPipeFD
+}
+
+func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	var err error
+	var fd namedPipeFD
+	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
+	if err != nil {
+		return nil, err
+	}
+	fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+	return &fd.vfsfd, nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
new file mode 100644
index 000000000..0674b81a3
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -0,0 +1,233 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+	"bytes"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const fileName = "mypipe"
+
+func TestSeparateFDs(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side. This is done concurrently because opening one
+	// end of the pipe blocks until the other end is opened.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Pathname:           fileName,
+		FollowFinalSymlink: true,
+	}
+	rfdchan := make(chan *vfs.FileDescription)
+	go func() {
+		openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY}
+		rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+		rfdchan <- rfd
+	}()
+
+	// Open the write side.
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	rfd := <-rfdchan // The channel is never closed; a failed open is seen as a nil fd.
+	if rfd == nil {
+		t.Fatalf("failed to open pipe for reading %q", fileName)
+	}
+	defer rfd.DecRef()
+
+	const msg = "vamos azul"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+func TestNonblockingRead(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side as nonblocking.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Pathname:           fileName,
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
+	rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
+	}
+	defer rfd.DecRef()
+
+	// Open the write side.
+	openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	const msg = "geh blau"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+func TestNonblockingWriteError(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the write side as nonblocking, which should return ENXIO.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Pathname:           fileName,
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
+	_, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != syserror.ENXIO {
+		t.Fatalf("expected ENXIO, but got error: %v", err)
+	}
+}
+
+func TestSingleFD(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the pipe as readable and writable through a single FD.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Pathname:           fileName,
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
+	fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for reading and writing %q: %v", fileName, err)
+	}
+	defer fd.DecRef()
+
+	const msg = "forza blu"
+	checkEmpty(ctx, t, fd)
+	checkWrite(ctx, t, fd, msg)
+	checkRead(ctx, t, fd, msg)
+}
+
+// setup creates a VFS with a pipe in the root directory at path fileName. The
+// returned VirtualDentry must be DecRef()'d by the caller. It calls t.Fatal
+// upon failure.
+func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create VFS.
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create memfs root mount: %v", err)
+	}
+
+	// Create the pipe.
+	root := mntns.Root()
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Pathname:           fileName,
+		FollowFinalSymlink: true,
+	}
+	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
+	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
+		t.Fatalf("failed to create file %q: %v", fileName, err)
+	}
+
+	// Sanity check: the pipe exists and has the correct mode.
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Pathname:           fileName,
+		FollowFinalSymlink: true,
+	}, &vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat(%q) failed: %v", fileName, err)
+	}
+	if stat.Mode&^linux.S_IFMT != 0644 {
+		t.Errorf("got wrong permissions (%0o)", stat.Mode)
+	}
+	if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe {
+		t.Errorf("got wrong file type (%0o)", stat.Mode)
+	}
+
+	return ctx, creds, vfsObj, root
+}
+
+// checkEmpty calls t.Fatal if the pipe in fd is not empty.
+func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
+	readData := make([]byte, 1)
+	dst := usermem.BytesIOSequence(readData)
+	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	if err != syserror.ErrWouldBlock {
+		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
+	}
+	if bytesRead != 0 {
+		t.Fatalf("expected to read 0 bytes, but got %d", bytesRead)
+	}
+}
+
+// checkWrite calls t.Fatal if it fails to write all of msg to fd.
+func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	writeData := []byte(msg)
+	src := usermem.BytesIOSequence(writeData)
+	bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("error writing to pipe %q: %v", fileName, err)
+	}
+	if bytesWritten != int64(len(writeData)) {
+		t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten)
+	}
+}
+
+// checkRead calls t.Fatal if it fails to read msg from fd.
+func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	readData := make([]byte, len(msg))
+	dst := usermem.BytesIOSequence(readData)
+	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	if err != nil {
+		t.Fatalf("error reading from pipe %q: %v", fileName, err)
+	}
+	if bytesRead != int64(len(msg)) {
+		t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead)
+	}
+	if !bytes.Equal(readData, []byte(msg)) {
+		t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData))
+	}
+}
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 184b566d9..d5284f0d9 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -1,10 +1,10 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
 package(
     default_visibility = ["//:sandbox"],
     licenses = ["notice"],
 )
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "inet",
     srcs = [
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index aba2414d4..e041c51b3 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -1,12 +1,11 @@
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
 load("@rules_cc//cc:defs.bzl", "cc_proto_library")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "pending_signals_list",
     out = "pending_signals_list.go",
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 1d00a6310..51de4568a 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -1,8 +1,8 @@
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "atomicptr_credentials",
     out = "atomicptr_credentials_unsafe.go",
diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD
index bec13a3d9..3a88a585c 100644
--- a/pkg/sentry/kernel/contexttest/BUILD
+++ b/pkg/sentry/kernel/contexttest/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "contexttest",
     testonly = 1,
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index 65427b112..3361e8b7d 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "epoll_list",
     out = "epoll_list.go",
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index 983ca67ed..e65b961e8 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "eventfd",
     srcs = ["eventfd.go"],
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 5eddca115..49d81b712 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "fasync",
     srcs = ["fasync.go"],
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index cc3f43a45..11f613a11 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -81,6 +81,9 @@ type FDTable struct {
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
 
+	// next is the position at which NewFDs starts searching for a free fd.
+	next int32
+
 	// used contains the number of non-nil entries. It must be accessed
 	// atomically. It may be read atomically without holding mu (but not
 	// written).
@@ -226,6 +229,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	// Start the search for an available fd no lower than f.next.
+	if fd < f.next {
+		fd = f.next
+	}
+
 	// Install all entries.
 	for i := fd; i < end && len(fds) < len(files); i++ {
 		if d, _, _ := f.get(i); d == nil {
@@ -242,6 +250,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 		return nil, syscall.EMFILE
 	}
 
+	if fd == f.next {
+		// Update next search start position.
+		f.next = fds[len(fds)-1] + 1
+	}
+
 	return fds, nil
 }
 
@@ -361,6 +374,11 @@ func (f *FDTable) Remove(fd int32) *fs.File {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	// Update current available position.
+	if fd < f.next {
+		f.next = fd
+	}
+
 	orig, _, _ := f.get(fd)
 	if orig != nil {
 		orig.IncRef()             // Reference for caller.
@@ -377,6 +395,10 @@ func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
 	f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
 		if cond(file, flags) {
 			f.set(fd, nil, FDFlags{}) // Clear from table.
+			// Update current available position.
+			if fd < f.next {
+				f.next = fd
+			}
 		}
 	})
 }
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 2413788e7..2bcb6216a 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -70,6 +70,42 @@ func TestFDTableMany(t *testing.T) {
 		if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil {
 			t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
 		}
+
+		i := int32(2)
+		fdTable.Remove(i)
+		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
+			t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
+		}
+	})
+}
+
+func TestFDTableOverLimit(t *testing.T) {
+	runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) {
+		if _, err := fdTable.NewFDs(ctx, maxFD, []*fs.File{file}, FDFlags{}); err == nil {
+			t.Fatalf("fdTable.NewFDs(maxFD, f): got nil, wanted error")
+		}
+
+		if _, err := fdTable.NewFDs(ctx, maxFD-2, []*fs.File{file, file, file}, FDFlags{}); err == nil {
+			t.Fatalf("fdTable.NewFDs(maxFD-2, {f,f,f}): got nil, wanted error")
+		}
+
+		if fds, err := fdTable.NewFDs(ctx, maxFD-3, []*fs.File{file, file, file}, FDFlags{}); err != nil {
+			t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
+		} else {
+			for _, fd := range fds {
+				fdTable.Remove(fd)
+			}
+		}
+
+		if fds, err := fdTable.NewFDs(ctx, maxFD-1, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != maxFD-1 {
+			t.Fatalf("fdTable.NewFDs(maxFD-1, {f}): got (%v, %v), wanted ({%v}, nil)", fds, err, maxFD-1)
+		}
+
+		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil {
+			t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
+		} else if len(fds) != 1 || fds[0] != 0 {
+			t.Fatalf("Added an FD to a resized map: got %v, want {0}", fds)
+		}
+	})
+}
 
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 41f44999c..34286c7a8 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "atomicptr_bucket",
     out = "atomicptr_bucket_unsafe.go",
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 2ce8952e2..9d34f6d4d 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "buffer_list",
     out = "buffer_list.go",
@@ -25,8 +24,10 @@ go_library(
         "device.go",
         "node.go",
         "pipe.go",
+        "pipe_util.go",
         "reader.go",
         "reader_writer.go",
+        "vfs.go",
         "writer.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe",
@@ -41,6 +42,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/safemem",
         "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index a2dc72204..4a19ab7ce 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -18,7 +18,6 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -91,10 +90,10 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
 	switch {
 	case flags.Read && !flags.Write: // O_RDONLY.
 		r := i.p.Open(ctx, d, flags)
-		i.newHandleLocked(&i.rWakeup)
+		newHandleLocked(&i.rWakeup)
 
 		if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
-			if !i.waitFor(&i.wWakeup, ctx) {
+			if !waitFor(&i.mu, &i.wWakeup, ctx) {
 				r.DecRef()
 				return nil, syserror.ErrInterrupted
 			}
@@ -107,7 +106,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
 
 	case flags.Write && !flags.Read: // O_WRONLY.
 		w := i.p.Open(ctx, d, flags)
-		i.newHandleLocked(&i.wWakeup)
+		newHandleLocked(&i.wWakeup)
 
 		if i.p.isNamed && !i.p.HasReaders() {
 			// On a nonblocking, write-only open, the open fails with ENXIO if the
@@ -117,7 +116,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
 				return nil, syserror.ENXIO
 			}
 
-			if !i.waitFor(&i.rWakeup, ctx) {
+			if !waitFor(&i.mu, &i.rWakeup, ctx) {
 				w.DecRef()
 				return nil, syserror.ErrInterrupted
 			}
@@ -127,8 +126,8 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
 	case flags.Read && flags.Write: // O_RDWR.
 		// Pipes opened for read-write always succeeds without blocking.
 		rw := i.p.Open(ctx, d, flags)
-		i.newHandleLocked(&i.rWakeup)
-		i.newHandleLocked(&i.wWakeup)
+		newHandleLocked(&i.rWakeup)
+		newHandleLocked(&i.wWakeup)
 		return rw, nil
 
 	default:
@@ -136,65 +135,6 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
 	}
 }
 
-// waitFor blocks until the underlying pipe has at least one reader/writer is
-// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this
-// function will block for either readers or writers, depending on where
-// 'wakeupChan' points.
-//
-// f.mu must be held by the caller. waitFor returns with f.mu held, but it will
-// drop f.mu before blocking for any reader/writers.
-func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
-	// Ideally this function would simply use a condition variable. However, the
-	// wait needs to be interruptible via 'sleeper', so we must sychronize via a
-	// channel. The synchronization below relies on the fact that closing a
-	// channel unblocks all receives on the channel.
-
-	// Does an appropriate wakeup channel already exist? If not, create a new
-	// one. This is all done under f.mu to avoid races.
-	if *wakeupChan == nil {
-		*wakeupChan = make(chan struct{})
-	}
-
-	// Grab a local reference to the wakeup channel since it may disappear as
-	// soon as we drop f.mu.
-	wakeup := *wakeupChan
-
-	// Drop the lock and prepare to sleep.
-	i.mu.Unlock()
-	cancel := sleeper.SleepStart()
-
-	// Wait for either a new reader/write to be signalled via 'wakeup', or
-	// for the sleep to be cancelled.
-	select {
-	case <-wakeup:
-		sleeper.SleepFinish(true)
-	case <-cancel:
-		sleeper.SleepFinish(false)
-	}
-
-	// Take the lock and check if we were woken. If we were woken and
-	// interrupted, the former takes priority.
-	i.mu.Lock()
-	select {
-	case <-wakeup:
-		return true
-	default:
-		return false
-	}
-}
-
-// newHandleLocked signals a new pipe reader or writer depending on where
-// 'wakeupChan' points. This unblocks any corresponding reader or writer
-// waiting for the other end of the channel to be opened, see Fifo.waitFor.
-//
-// i.mu must be held.
-func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) {
-	if *wakeupChan != nil {
-		close(*wakeupChan)
-		*wakeupChan = nil
-	}
-}
-
 func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error {
 	return syserror.EPIPE
 }
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 8e4e8e82e..1a1b38f83 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -111,11 +111,27 @@ func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe {
 	if atomicIOBytes > sizeBytes {
 		atomicIOBytes = sizeBytes
 	}
-	return &Pipe{
-		isNamed:       isNamed,
-		max:           sizeBytes,
-		atomicIOBytes: atomicIOBytes,
+	var p Pipe
+	initPipe(&p, isNamed, sizeBytes, atomicIOBytes)
+	return &p
+}
+
+func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) {
+	if sizeBytes < MinimumPipeSize {
+		sizeBytes = MinimumPipeSize
+	}
+	if sizeBytes > MaximumPipeSize {
+		sizeBytes = MaximumPipeSize
+	}
+	if atomicIOBytes <= 0 {
+		atomicIOBytes = 1
+	}
+	if atomicIOBytes > sizeBytes {
+		atomicIOBytes = sizeBytes
 	}
+	pipe.isNamed = isNamed
+	pipe.max = sizeBytes
+	pipe.atomicIOBytes = atomicIOBytes
 }
 
 // NewConnectedPipe initializes a pipe and returns a pair of objects
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
new file mode 100644
index 000000000..ef9641e6a
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -0,0 +1,213 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+	"io"
+	"math"
+	"sync"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// This file contains Pipe file functionality that is tied to neither VFS nor
+// the old fs architecture.
+
+// Release cleans up the pipe's state.
+func (p *Pipe) Release() {
+	p.rClose()
+	p.wClose()
+
+	// Wake up readers and writers.
+	p.Notify(waiter.EventIn | waiter.EventOut)
+}
+
+// Read reads from the Pipe into dst.
+func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	n, err := p.read(ctx, readOps{
+		left: func() int64 {
+			return dst.NumBytes()
+		},
+		limit: func(l int64) {
+			dst = dst.TakeFirst64(l)
+		},
+		read: func(buf *buffer) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, buf)
+			dst = dst.DropFirst64(n)
+			return n, err
+		},
+	})
+	if n > 0 {
+		p.Notify(waiter.EventOut)
+	}
+	return n, err
+}
+
+// WriteTo writes to w from the Pipe.
+func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) {
+	ops := readOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		read: func(buf *buffer) (int64, error) {
+			n, err := buf.ReadToWriter(w, count, dup)
+			count -= n
+			return n, err
+		},
+	}
+	if dup {
+		// There is no notification for dup operations.
+		return p.dup(ctx, ops)
+	}
+	n, err := p.read(ctx, ops)
+	if n > 0 {
+		p.Notify(waiter.EventOut)
+	}
+	return n, err
+}
+
+// Write writes to the Pipe from src.
+func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	n, err := p.write(ctx, writeOps{
+		left: func() int64 {
+			return src.NumBytes()
+		},
+		limit: func(l int64) {
+			src = src.TakeFirst64(l)
+		},
+		write: func(buf *buffer) (int64, error) {
+			n, err := src.CopyInTo(ctx, buf)
+			src = src.DropFirst64(n)
+			return n, err
+		},
+	})
+	if n > 0 {
+		p.Notify(waiter.EventIn)
+	}
+	return n, err
+}
+
+// ReadFrom reads from r to the Pipe.
+func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) {
+	n, err := p.write(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(buf *buffer) (int64, error) {
+			n, err := buf.WriteFromReader(r, count)
+			count -= n
+			return n, err
+		},
+	})
+	if n > 0 {
+		p.Notify(waiter.EventIn)
+	}
+	return n, err
+}
+
+// Readiness returns the ready events in the underlying pipe.
+func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return p.rwReadiness() & mask
+}
+
+// Ioctl implements ioctls on the Pipe.
+func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// Switch on ioctl request.
+	switch int(args[1].Int()) {
+	case linux.FIONREAD:
+		v := p.queued()
+		if v > math.MaxInt32 {
+			v = math.MaxInt32 // Silently truncate.
+		}
+		// Copy result to user-space.
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	default:
+		return 0, syscall.ENOTTY
+	}
+}
+
+// waitFor blocks until a new reader or writer of the underlying pipe is
+// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this
+// function will block for either readers or writers, depending on where
+// 'wakeupChan' points.
+//
+// mu must be held by the caller. waitFor returns with mu held, but it will
+// drop mu before blocking for any reader/writers.
+func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
+	// Ideally this function would simply use a condition variable. However, the
+	// wait needs to be interruptible via 'sleeper', so we must synchronize via a
+	// channel. The synchronization below relies on the fact that closing a
+	// channel unblocks all receives on the channel.
+
+	// Does an appropriate wakeup channel already exist? If not, create a new
+	// one. This is all done under mu to avoid races.
+	if *wakeupChan == nil {
+		*wakeupChan = make(chan struct{})
+	}
+
+	// Grab a local reference to the wakeup channel since it may disappear as
+	// soon as we drop mu.
+	wakeup := *wakeupChan
+
+	// Drop the lock and prepare to sleep.
+	mu.Unlock()
+	cancel := sleeper.SleepStart()
+
+	// Wait for either a new reader/writer to be signalled via 'wakeup', or
+	// for the sleep to be cancelled.
+	select {
+	case <-wakeup:
+		sleeper.SleepFinish(true)
+	case <-cancel:
+		sleeper.SleepFinish(false)
+	}
+
+	// Take the lock and check if we were woken. If we were woken and
+	// interrupted, the former takes priority.
+	mu.Lock()
+	select {
+	case <-wakeup:
+		return true
+	default:
+		return false
+	}
+}
+
+// newHandleLocked signals a new pipe reader or writer depending on where
+// 'wakeupChan' points. This unblocks any corresponding reader or writer
+// waiting for the other end of the pipe to be opened, see waitFor.
+//
+// Precondition: the mutex protecting wakeupChan must be held.
+func newHandleLocked(wakeupChan *chan struct{}) {
+	if *wakeupChan != nil {
+		close(*wakeupChan)
+		*wakeupChan = nil
+	}
+}
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go
index 7c307f013..b4d29fc77 100644
--- a/pkg/sentry/kernel/pipe/reader_writer.go
+++ b/pkg/sentry/kernel/pipe/reader_writer.go
@@ -16,16 +16,12 @@ package pipe
 
 import (
 	"io"
-	"math"
-	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 // ReaderWriter satisfies the FileOperations interface and services both
@@ -45,124 +41,27 @@ type ReaderWriter struct {
 	*Pipe
 }
 
-// Release implements fs.FileOperations.Release.
-func (rw *ReaderWriter) Release() {
-	rw.Pipe.rClose()
-	rw.Pipe.wClose()
-
-	// Wake up readers and writers.
-	rw.Pipe.Notify(waiter.EventIn | waiter.EventOut)
-}
-
 // Read implements fs.FileOperations.Read.
 func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
-	n, err := rw.Pipe.read(ctx, readOps{
-		left: func() int64 {
-			return dst.NumBytes()
-		},
-		limit: func(l int64) {
-			dst = dst.TakeFirst64(l)
-		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, buf)
-			dst = dst.DropFirst64(n)
-			return n, err
-		},
-	})
-	if n > 0 {
-		rw.Pipe.Notify(waiter.EventOut)
-	}
-	return n, err
+	return rw.Pipe.Read(ctx, dst)
 }
 
 // WriteTo implements fs.FileOperations.WriteTo.
 func (rw *ReaderWriter) WriteTo(ctx context.Context, _ *fs.File, w io.Writer, count int64, dup bool) (int64, error) {
-	ops := readOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := buf.ReadToWriter(w, count, dup)
-			count -= n
-			return n, err
-		},
-	}
-	if dup {
-		// There is no notification for dup operations.
-		return rw.Pipe.dup(ctx, ops)
-	}
-	n, err := rw.Pipe.read(ctx, ops)
-	if n > 0 {
-		rw.Pipe.Notify(waiter.EventOut)
-	}
-	return n, err
+	return rw.Pipe.WriteTo(ctx, w, count, dup)
 }
 
 // Write implements fs.FileOperations.Write.
 func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
-	n, err := rw.Pipe.write(ctx, writeOps{
-		left: func() int64 {
-			return src.NumBytes()
-		},
-		limit: func(l int64) {
-			src = src.TakeFirst64(l)
-		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := src.CopyInTo(ctx, buf)
-			src = src.DropFirst64(n)
-			return n, err
-		},
-	})
-	if n > 0 {
-		rw.Pipe.Notify(waiter.EventIn)
-	}
-	return n, err
+	return rw.Pipe.Write(ctx, src)
 }
 
 // ReadFrom implements fs.FileOperations.WriteTo.
 func (rw *ReaderWriter) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
-	n, err := rw.Pipe.write(ctx, writeOps{
-		left: func() int64 {
-			return count
-		},
-		limit: func(l int64) {
-			count = l
-		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := buf.WriteFromReader(r, count)
-			count -= n
-			return n, err
-		},
-	})
-	if n > 0 {
-		rw.Pipe.Notify(waiter.EventIn)
-	}
-	return n, err
-}
-
-// Readiness returns the ready events in the underlying pipe.
-func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return rw.Pipe.rwReadiness() & mask
+	return rw.Pipe.ReadFrom(ctx, r, count)
 }
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
-	// Switch on ioctl request.
-	switch int(args[1].Int()) {
-	case linux.FIONREAD:
-		v := rw.queued()
-		if v > math.MaxInt32 {
-			v = math.MaxInt32 // Silently truncate.
-		}
-		// Copy result to user-space.
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
-		return 0, err
-	default:
-		return 0, syscall.ENOTTY
-	}
+	return rw.Pipe.Ioctl(ctx, io, args)
 }
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
new file mode 100644
index 000000000..6416e0dd8
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -0,0 +1,220 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// This file contains types enabling the pipe package to be used with the vfs
+// package.
+
+// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should
+// not be copied.
+type VFSPipe struct {
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// pipe is the underlying pipe.
+	pipe Pipe
+
+	// Channels for synchronizing the creation of new readers and writers
+	// of this fifo. See waitFor and newHandleLocked.
+	//
+	// These are not saved/restored because all waiters are unblocked on
+	// save, and either automatically restart (via ERESTARTSYS) or return
+	// EINTR on resume. On restarts via ERESTARTSYS, the appropriate
+	// channel will be recreated.
+	rWakeup chan struct{} `state:"nosave"`
+	wWakeup chan struct{} `state:"nosave"`
+}
+
+// NewVFSPipe returns an initialized VFSPipe.
+func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe {
+	var vp VFSPipe
+	initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes)
+	return &vp
+}
+
+// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics
+// during open:
+//
+// "Normally, opening the FIFO blocks until the other end is opened also. A
+// process can open a FIFO in nonblocking mode. In this case, opening for
+// read-only will succeed even if no-one has opened on the write side yet,
+// opening for write-only will fail with ENXIO (no such device or address)
+// unless the other end has already been opened. Under Linux, opening a FIFO
+// for read and write will succeed both in blocking and nonblocking mode. POSIX
+// leaves this behavior undefined. This can be used to open a FIFO for writing
+// while there are no readers available." - fifo(7)
+func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+	vp.mu.Lock()
+	defer vp.mu.Unlock()
+
+	readable := vfs.MayReadFileWithOpenFlags(flags)
+	writable := vfs.MayWriteFileWithOpenFlags(flags)
+	if !readable && !writable {
+		return nil, syserror.EINVAL
+	}
+
+	vfd, err := vp.open(rp, vfsd, vfsfd, flags)
+	if err != nil {
+		return nil, err
+	}
+
+	switch {
+	case readable && writable:
+		// Pipes opened for read-write always succeed without blocking.
+		newHandleLocked(&vp.rWakeup)
+		newHandleLocked(&vp.wWakeup)
+
+	case readable:
+		newHandleLocked(&vp.rWakeup)
+		// If this pipe is being opened as blocking and there's no
+		// writer, we have to wait for a writer to open the other end.
+		if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+			return nil, syserror.EINTR
+		}
+
+	case writable:
+		newHandleLocked(&vp.wWakeup)
+
+		if !vp.pipe.HasReaders() {
+			// Nonblocking, write-only opens fail with ENXIO when
+			// the read side isn't open yet.
+			if flags&linux.O_NONBLOCK != 0 {
+				return nil, syserror.ENXIO
+			}
+			// Wait for a reader to open the other end.
+			if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
+				return nil, syserror.EINTR
+			}
+		}
+
+	default:
+		panic("invalid pipe flags: must be readable, writable, or both")
+	}
+
+	return vfd, nil
+}
+
+// Preconditions: vp.mu must be held.
+func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+	var fd VFSPipeFD
+	fd.flags = flags
+	fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+	fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+	fd.vfsfd = vfsfd
+	fd.pipe = &vp.pipe
+	if fd.writable {
+		// The corresponding Mount.EndWrite() is in VFSPipeFD.Release().
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+	}
+
+	switch {
+	case fd.readable && fd.writable:
+		vp.pipe.rOpen()
+		vp.pipe.wOpen()
+	case fd.readable:
+		vp.pipe.rOpen()
+	case fd.writable:
+		vp.pipe.wOpen()
+	default:
+		panic("invalid pipe flags: must be readable, writable, or both")
+	}
+
+	return &fd, nil
+}
+
+// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is
+// expected that filesystems will use this in a struct implementing
+// vfs.FileDescriptionImpl.
+type VFSPipeFD struct {
+	pipe     *Pipe
+	flags    uint32
+	readable bool
+	writable bool
+	vfsfd    *vfs.FileDescription
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *VFSPipeFD) Release() {
+	var event waiter.EventMask
+	if fd.readable {
+		fd.pipe.rClose()
+		event |= waiter.EventIn
+	}
+	if fd.writable {
+		fd.pipe.wClose()
+		event |= waiter.EventOut
+	}
+	if event == 0 {
+		panic("invalid pipe flags: must be readable, writable, or both")
+	}
+
+	if fd.writable {
+		fd.vfsfd.VirtualDentry().Mount().EndWrite()
+	}
+
+	fd.pipe.Notify(event)
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *VFSPipeFD) OnClose(_ context.Context) error {
+	return nil
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+
+	return fd.pipe.Read(ctx, dst)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	if !fd.writable {
+		return 0, syserror.EINVAL
+	}
+
+	return fd.pipe.Write(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return fd.pipe.Ioctl(ctx, uio, args)
+}
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index 80e5e5da3..f4c00cd86 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "waiter_list",
     out = "waiter_list.go",
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index aa7471eb6..cd48945e6 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "shm",
     srcs = [
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 9beae4b31..31847e1df 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "time",
     srcs = [
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 59649c770..156e67bf8 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "limits",
     srcs = [
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 3b322f5f3..2890393bd 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_embed_data")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_embed_data(
     name = "vdso_bin",
     src = "//vdso:vdso.so",
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 9687e7e76..3ef84245b 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "mappable_range",
     out = "mappable_range.go",
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index b35c8c673..a804b8b5c 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "file_refcount_set",
     out = "file_refcount_set.go",
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 3fd904c67..f404107af 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "evictable_range",
     out = "evictable_range.go",
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 9aa6ec507..157bffa81 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -1,8 +1,8 @@
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "file_range",
     out = "file_range.go",
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 9f0ecfbe4..b699b057d 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -327,6 +327,19 @@ func (t *thread) dumpAndPanic(message string) {
 	panic(message)
 }
 
+func (t *thread) unexpectedStubExit() {
+	msg, err := t.getEventMessage()
+	status := syscall.WaitStatus(msg)
+	if status.Signaled() && status.Signal() == syscall.SIGKILL {
+		// SIGKILL can only be sent by a user or the OOM-killer. In both
+		// of these cases, we don't need to panic. There is no reason to
+		// think that something is wrong in gVisor.
+		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
+		syscall.Kill(os.Getpid(), syscall.SIGKILL)
+	}
+	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
+}
+
 // wait waits for a stop event.
 //
 // Precondition: outcome is a valid waitOutcome.
@@ -355,8 +368,7 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
 			}
 			if stopSig == syscall.SIGTRAP {
 				if status.TrapCause() == syscall.PTRACE_EVENT_EXIT {
-					msg, err := t.getEventMessage()
-					t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
+					t.unexpectedStubExit()
 				}
 				// Re-encode the trap cause the way it's expected.
 				return stopSig | syscall.Signal(status.TrapCause()<<8)
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index c075b5f91..3782d4332 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -129,6 +129,9 @@ func createStub() (*thread, error) {
 	// transitively) will be killed as well. It's simply not possible to
 	// safely handle a single stub getting killed: the exact state of
 	// execution is unknown and not recoverable.
+	//
+	// In addition, we set the PTRACE_O_TRACEEXIT option to log more
+	// information about a stub process when it receives a fatal signal.
 	return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
 }
 
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 8ed6c7652..48b0ceaec 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -1,9 +1,8 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
 go_template(
     name = "defs",
     srcs = [
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index d7029d5a9..780bf9a66 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "defs_impl",
     out = "defs_impl.go",
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index ea090b686..934a90378 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -1,10 +1,9 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
 go_template(
     name = "generic_walker",
     srcs = [
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 3300f9a6b..26176b10d 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "socket",
     srcs = ["socket.go"],
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 81dbd7309..4a6e83a8b 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "control",
     srcs = ["control.go"],
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index a951f1bb0..4d174dda4 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "hostinet",
     srcs = [
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 354a0d6ee..5eb06bbf4 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "netfilter",
     srcs = [
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 7da68384e..f95803f91 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "netlink",
     srcs = [
diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD
index 445080aa4..463544c1a 100644
--- a/pkg/sentry/socket/netlink/port/BUILD
+++ b/pkg/sentry/socket/netlink/port/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
-
 go_library(
     name = "port",
     srcs = ["port.go"],
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 5dc8533ec..1d4912753 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "route",
     srcs = ["protocol.go"],
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 60523f79a..e414d8055 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "netstack",
     srcs = [
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 6fd43fcbd..69dbfd197 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -53,6 +53,7 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
@@ -298,6 +299,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue
 
 var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{}))
 var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{}))
+var sockAddrLinkSize = int(binary.Size(linux.SockAddrLink{}))
 
 // bytesToIPAddress converts an IPv4 or IPv6 address from the user to the
 // netstack representation taking any addresses into account.
@@ -309,12 +311,12 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
 }
 
 // AddressAndFamily reads an sockaddr struct from the given address and
-// converts it to the FullAddress format. It supports AF_UNIX, AF_INET and
-// AF_INET6 addresses.
+// converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
+// AF_INET6, and AF_PACKET addresses.
 //
 // strict indicates whether addresses with the AF_UNSPEC family are accepted of not.
 //
-// AddressAndFamily returns an address, its family.
+// AddressAndFamily returns an address and its family.
 func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
 	// Make sure we have at least 2 bytes for the address family.
 	if len(addr) < 2 {
@@ -373,6 +375,22 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress,
 		}
 		return out, family, nil
 
+	case linux.AF_PACKET:
+		var a linux.SockAddrLink
+		if len(addr) < sockAddrLinkSize {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+		if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+
+		// TODO(b/129292371): Return protocol too.
+		return tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}, family, nil
+
 	case linux.AF_UNSPEC:
 		return tcpip.FullAddress{}, family, nil
 
@@ -1953,12 +1971,14 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 			return &out, uint32(2 + l)
 		}
 		return &out, uint32(3 + l)
+
 	case linux.AF_INET:
 		var out linux.SockAddrInet
 		copy(out.Addr[:], addr.Addr)
 		out.Family = linux.AF_INET
 		out.Port = htons(addr.Port)
-		return &out, uint32(binary.Size(out))
+		return &out, uint32(sockAddrInetSize)
+
 	case linux.AF_INET6:
 		var out linux.SockAddrInet6
 		if len(addr.Addr) == 4 {
@@ -1974,7 +1994,17 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 		if isLinkLocal(addr.Addr) {
 			out.Scope_id = uint32(addr.NIC)
 		}
-		return &out, uint32(binary.Size(out))
+		return &out, uint32(sockAddrInet6Size)
+
+	case linux.AF_PACKET:
+		// TODO(b/129292371): Return protocol too.
+		var out linux.SockAddrLink
+		out.Family = linux.AF_PACKET
+		out.InterfaceIndex = int32(addr.NIC)
+		out.HardwareAddrLen = header.EthernetAddressSize
+		copy(out.HardwareAddr[:], addr.Addr)
+		return &out, uint32(sockAddrLinkSize)
+
 	default:
 		return nil, 0
 	}
diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 357a664cc..2d2c1ba2a 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -62,6 +62,10 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		}
 
 	case linux.SOCK_RAW:
+		// TODO(b/142504697): "In order to create a raw socket, a
+		// process must have the CAP_NET_RAW capability in the user
+		// namespace that governs its network namespace." - raw(7)
+
 		// Raw sockets require CAP_NET_RAW.
 		creds := auth.CredentialsFromContext(ctx)
 		if !creds.HasCapability(linux.CAP_NET_RAW) {
@@ -85,7 +89,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 	return 0, true, syserr.ErrProtocolNotSupported
 }
 
-// Socket creates a new socket object for the AF_INET or AF_INET6 family.
+// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET
+// family.
 func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
 	// Fail right away if we don't have a stack.
 	stack := t.NetworkContext()
@@ -99,6 +104,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
 		return nil, nil
 	}
 
+	// Packet sockets are handled separately, since they are neither INET
+	// nor INET6 specific.
+	if p.family == linux.AF_PACKET {
+		return packetSocket(t, eps, stype, protocol)
+	}
+
 	// Figure out the transport protocol.
 	transProto, associated, err := getTransportProtocol(t, stype, protocol)
 	if err != nil {
@@ -121,12 +132,47 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
 	return New(t, p.family, stype, int(transProto), wq, ep)
 }
 
+func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
+	// TODO(b/142504697): "In order to create a packet socket, a process
+	// must have the CAP_NET_RAW capability in the user namespace that
+	// governs its network namespace." - packet(7)
+
+	// Packet sockets require CAP_NET_RAW.
+	creds := auth.CredentialsFromContext(t)
+	if !creds.HasCapability(linux.CAP_NET_RAW) {
+		return nil, syserr.ErrNotPermitted
+	}
+
+	// "cooked" (SOCK_DGRAM) packets don't contain link layer information.
+	var cooked bool
+	switch stype {
+	case linux.SOCK_DGRAM:
+		cooked = true
+	case linux.SOCK_RAW:
+		cooked = false
+	default:
+		return nil, syserr.ErrProtocolNotSupported
+	}
+
+	// protocol is passed in network byte order, but netstack wants it in
+	// host order. Note that New below still receives protocol unconverted.
+	netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol)))
+
+	wq := &waiter.Queue{}
+	ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq)
+	if err != nil {
+		return nil, syserr.TranslateNetstackError(err)
+	}
+
+	return New(t, linux.AF_PACKET, stype, protocol, wq, ep)
+}
+
 // Pair just returns nil sockets (not supported).
 func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
 	return nil, nil, nil
 }
 
-// init registers socket providers for AF_INET and AF_INET6.
+// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET.
 func init() {
 	// Providers backed by netstack.
 	p := []provider{
@@ -138,6 +184,9 @@ func init() {
 			family:   linux.AF_INET6,
 			netProto: ipv6.ProtocolNumber,
 		},
+		{
+			family: linux.AF_PACKET,
+		},
 	}
 
 	for i := range p {
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 830f4da10..5b6a154f6 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "unix",
     srcs = [
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 0b0240336..788ad70d2 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -1,8 +1,8 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "transport_message_list",
     out = "transport_message_list.go",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 4bd15808a..dea11e253 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -220,6 +220,11 @@ func (e *connectionedEndpoint) Close() {
 	case e.Connected():
 		e.connected.CloseSend()
 		e.receiver.CloseRecv()
+		// Still have unread data? If so, flag it on the write end so
+		// that the peer can get ECONNRESET when it does read.
+		if e.receiver.RecvQueuedSize() > 0 {
+			e.connected.CloseUnread()
+		}
 		c = e.connected
 		r = e.receiver
 		e.connected = nil
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index 0415fae9a..e27b1c714 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -33,6 +33,7 @@ type queue struct {
 
 	mu       sync.Mutex `state:"nosave"`
 	closed   bool
+	unread   bool
 	used     int64
 	limit    int64
 	dataList messageList
@@ -161,6 +162,9 @@ func (q *queue) Dequeue() (e *message, notify bool, err *syserr.Error) {
 		err := syserr.ErrWouldBlock
 		if q.closed {
 			err = syserr.ErrClosedForReceive
+			if q.unread {
+				err = syserr.ErrConnectionReset
+			}
 		}
 		q.mu.Unlock()
 
@@ -188,7 +192,9 @@ func (q *queue) Peek() (*message, *syserr.Error) {
 	if q.dataList.Front() == nil {
 		err := syserr.ErrWouldBlock
 		if q.closed {
-			err = syserr.ErrClosedForReceive
+			if err = syserr.ErrClosedForReceive; q.unread {
+				err = syserr.ErrConnectionReset
+			}
 		}
 		return nil, err
 	}
@@ -208,3 +214,11 @@ func (q *queue) QueuedSize() int64 {
 func (q *queue) MaxQueueSize() int64 {
 	return q.limit
 }
+
+// CloseUnread flags that the peer was closed (not shut down) while it still
+// had unread data, so reads on this closed, drained queue get ECONNRESET.
+func (q *queue) CloseUnread() {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	q.unread = true
+}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 1867b3a5c..529a7a7a9 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -608,6 +608,10 @@ type ConnectedEndpoint interface {
 	// Release releases any resources owned by the ConnectedEndpoint. It should
 	// be called before droping all references to a ConnectedEndpoint.
 	Release()
+
+	// CloseUnread records, for the peer socket, that this end was closed
+	// with unread data.
+	CloseUnread()
 }
 
 // +stateify savable
@@ -711,6 +715,11 @@ func (e *connectedEndpoint) Release() {
 	e.writeQueue.DecRef()
 }
 
+// CloseUnread implements ConnectedEndpoint.CloseUnread.
+func (e *connectedEndpoint) CloseUnread() {
+	e.writeQueue.CloseUnread()
+}
+
 // baseEndpoint is an embeddable unix endpoint base used in both the connected and connectionless
 // unix domain socket Endpoint implementations.
 //
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 50c308134..1aaae8487 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -595,7 +595,8 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 				total += n
 			}
 
-			if err != nil || !waitAll || isPacket || n >= dst.NumBytes() {
+			streamPeerClosed := s.stype == linux.SOCK_STREAM && n == 0 && err == nil
+			if err != nil || !waitAll || isPacket || n >= dst.NumBytes() || streamPeerClosed {
 				if total > 0 {
 					err = nil
 				}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index cf2a56bed..fb2c1777f 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "linux",
     srcs = [
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index b64c49ff5..68589a377 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -16,7 +16,12 @@
 package linux
 
 const (
-	_LINUX_SYSNAME = "Linux"
-	_LINUX_RELEASE = "4.4"
-	_LINUX_VERSION = "#1 SMP Sun Jan 10 15:06:54 PST 2016"
+	// LinuxSysname is the OS name advertised by gVisor.
+	LinuxSysname = "Linux"
+
+	// LinuxRelease is the Linux release version number advertised by gVisor.
+	LinuxRelease = "4.4.0"
+
+	// LinuxVersion is the version info advertised by gVisor.
+	LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016"
 )
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index e215ac049..aedb6d774 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -34,9 +34,9 @@ var AMD64 = &kernel.SyscallTable{
 		// guides the interface provided by this syscall table. The build
 		// version is that for a clean build with default kernel config, at 5
 		// minutes after v4.4 was tagged.
-		Sysname: _LINUX_SYSNAME,
-		Release: _LINUX_RELEASE,
-		Version: _LINUX_VERSION,
+		Sysname: LinuxSysname,
+		Release: LinuxRelease,
+		Version: LinuxVersion,
 	},
 	AuditNumber: linux.AUDIT_ARCH_X86_64,
 	Table: map[uintptr]kernel.Syscall{
@@ -362,7 +362,7 @@ var AMD64 = &kernel.SyscallTable{
 		319: syscalls.Supported("memfd_create", MemfdCreate),
 		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		322: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}),    // TODO(b/118901836)
+		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_EMPTY_PATH, AT_SYMLINK_NOFOLLOW.", nil),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 1d3b63020..4cf7f836a 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -30,9 +30,9 @@ var ARM64 = &kernel.SyscallTable{
 	OS:   abi.Linux,
 	Arch: arch.ARM64,
 	Version: kernel.Version{
-		Sysname: _LINUX_SYSNAME,
-		Release: _LINUX_RELEASE,
-		Version: _LINUX_VERSION,
+		Sysname: LinuxSysname,
+		Release: LinuxRelease,
+		Version: LinuxVersion,
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
@@ -107,10 +107,10 @@ var ARM64 = &kernel.SyscallTable{
 		71:  syscalls.Supported("sendfile", Sendfile),
 		72:  syscalls.Supported("pselect", Pselect),
 		73:  syscalls.Supported("ppoll", Ppoll),
-		74:  syscalls.ErrorWithEvent("signalfd4", syserror.ENOSYS, "", []string{"gvisor.dev/issue/139"}),             // TODO(b/19846426)
+		74:  syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
 		75:  syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),              // TODO(b/29354098)
 		76:  syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
-		77:  syscalls.ErrorWithEvent("tee", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),                   // TODO(b/29354098)
+		77:  syscalls.Supported("tee", Tee),
 		78:  syscalls.Supported("readlinkat", Readlinkat),
 		80:  syscalls.Supported("fstat", Fstat),
 		81:  syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
@@ -245,7 +245,7 @@ var ARM64 = &kernel.SyscallTable{
 		210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
 		211: syscalls.Supported("sendmsg", SendMsg),
 		212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
-		213: syscalls.ErrorWithEvent("readahead", syserror.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
+		213: syscalls.Supported("readahead", Readahead),
 		214: syscalls.Supported("brk", Brk),
 		215: syscalls.Supported("munmap", Munmap),
 		216: syscalls.Supported("mremap", Mremap),
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 8ab7ffa25..6e425f1ec 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -15,10 +15,12 @@
 package linux
 
 import (
+	"path"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -67,8 +69,22 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	argvAddr := args[1].Pointer()
 	envvAddr := args[2].Pointer()
 
-	// Extract our arguments.
-	filename, err := t.CopyInString(filenameAddr, linux.PATH_MAX)
+	return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+
+	return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
 	if err != nil {
 		return 0, nil, err
 	}
@@ -89,14 +105,38 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		}
 	}
 
+	if flags != 0 {
+		// TODO(b/128449944): Handle AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.ENOSYS
+	}
+
 	root := t.FSContext().RootDirectory()
 	defer root.DecRef()
-	wd := t.FSContext().WorkingDirectory()
+
+	var wd *fs.Dirent
+	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
+		// If pathname is absolute, LoadTaskImage() will ignore the wd.
+		wd = t.FSContext().WorkingDirectory()
+	} else {
+		// Need to extract the given FD.
+		f := t.GetFile(dirFD)
+		if f == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer f.DecRef()
+		// Check before IncRef so the ENOTDIR return leaks no reference.
+		if !fs.IsDir(f.Dirent.Inode.StableAttr) {
+			return 0, nil, syserror.ENOTDIR
+		}
+		wd = f.Dirent
+		wd.IncRef()
+	}
 	defer wd.DecRef()
 
 	// Load the new TaskContext.
 	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, nil, argv, envv, t.Arch().FeatureSet())
+	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, nil, argv, envv, t.Arch().FeatureSet())
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 27cd2c336..ad4b67806 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -191,7 +191,6 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 }
 
 // Pwritev2 implements linux syscall pwritev2(2).
-// TODO(b/120161091): Implement O_SYNC and D_SYNC functionality.
 func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	// While the syscall is
 	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index beb43ba13..d3a4cd943 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -1,10 +1,9 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "seqatomic_parameters",
     out = "seqatomic_parameters_unsafe.go",
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index a34c39540..c32fe3241 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "usage",
     srcs = [
diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD
index cc5d25762..684f59a6b 100644
--- a/pkg/sentry/usermem/BUILD
+++ b/pkg/sentry/usermem/BUILD
@@ -1,10 +1,9 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_template_instance(
     name = "addr_range",
     out = "addr_range.go",
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 7eb2b2821..3a9665800 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -102,7 +102,7 @@ type FileDescriptionImpl interface {
 	// OnClose is called when a file descriptor representing the
 	// FileDescription is closed. Note that returning a non-nil error does not
 	// prevent the file descriptor from being closed.
-	OnClose() error
+	OnClose(ctx context.Context) error
 
 	// StatusFlags returns file description status flags, as for
 	// fcntl(F_GETFL).
@@ -180,7 +180,7 @@ type FileDescriptionImpl interface {
 	// ConfigureMMap mutates opts to implement mmap(2) for the file. Most
 	// implementations that support memory mapping can call
 	// GenericConfigureMMap with the appropriate memmap.Mappable.
-	ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error
+	ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error
 
 	// Ioctl implements the ioctl(2) syscall.
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index ba230da72..4fbad7840 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -45,7 +45,7 @@ type FileDescriptionDefaultImpl struct{}
 
 // OnClose implements FileDescriptionImpl.OnClose analogously to
 // file_operations::flush == NULL in Linux.
-func (FileDescriptionDefaultImpl) OnClose() error {
+func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error {
 	return nil
 }
 
@@ -117,7 +117,7 @@ func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error {
 
 // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to
 // file_operations::mmap == NULL in Linux.
-func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	return syserror.ENODEV
 }
 
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
index 23f2b9e08..abde0feaa 100644
--- a/pkg/sentry/vfs/syscalls.go
+++ b/pkg/sentry/vfs/syscalls.go
@@ -96,6 +96,26 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 	}
 }
 
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// Handle mount traversals.
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
 // OpenAt returns a FileDescription providing access to the file at the given
 // path. A reference is taken on the returned FileDescription.
 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
@@ -198,8 +218,6 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) err
 //
 // - VFS.LinkAt()
 //
-// - VFS.MknodAt()
-//
 // - VFS.ReadlinkAt()
 //
 // - VFS.RenameAt()
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index 329904457..be93750bf 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -1,11 +1,10 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "addr_range",
     out = "addr_range.go",
diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go
index 4287464f3..48a2a2713 100644
--- a/pkg/tcpip/buffer/prependable.go
+++ b/pkg/tcpip/buffer/prependable.go
@@ -41,6 +41,11 @@ func NewPrependableFromView(v View) Prependable {
 	return Prependable{buf: v, usedIdx: 0}
 }
 
+// NewEmptyPrependableFromView wraps v as a Prependable with usedIdx = len(v).
+func NewEmptyPrependableFromView(v View) Prependable {
+	return Prependable{buf: v, usedIdx: len(v)}
+}
+
 // View returns a View of the backing buffer that contains all prepended
 // data so far.
 func (p Prependable) View() View {
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 096ad71ab..02137e1c9 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -686,3 +686,64 @@ func ICMPv6Code(want byte) TransportChecker {
 		}
 	}
 }
+
+// NDP creates a checker that checks that the packet contains a valid NDP
+// message of type msgType, with potentially additional checks specified by
+// checkers.
+//
+// checkers may assume that they are passed a valid ICMPv6 containing an NDP
+// message whose payload is at least minSize bytes. The values within the
+// message are up to checkers to validate.
+func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// Check normal ICMPv6 first.
+		ICMPv6(
+			ICMPv6Type(msgType),
+			ICMPv6Code(0))(t, h)
+
+		last := h[len(h)-1]
+
+		icmp := header.ICMPv6(last.Payload())
+		if got := len(icmp.NDPPayload()); got < minSize {
+			t.Fatalf("ICMPv6 NDP (type = %d) payload size of %d is less than the minimum size of %d", msgType, got, minSize)
+		}
+
+		for _, f := range checkers {
+			f(t, icmp)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// NDPNS creates a checker that checks that the packet contains a valid NDP
+// Neighbor Solicitation message (as per the raw wire format), with potentially
+// additional checks specified by checkers.
+//
+// checkers may assume that a valid ICMPv6 is passed to them containing a valid
+// NDPNS message as far as the size of the message is concerned. The values
+// within the message are up to checkers to validate.
+func NDPNS(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6NeighborSolicit, header.NDPNSMinimumSize, checkers...)
+}
+
+// NDPNSTargetAddress creates a checker that checks the Target Address field of
+// a header.NDPNeighborSolicit.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNS message as far as the size is concerned.
+func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+
+		if got := ns.TargetAddress(); got != want {
+			t.Fatalf("got %T.TargetAddress = %s, want = %s", ns, got, want)
+		}
+	}
+}
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index a255231a3..a3485b35c 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -16,6 +16,10 @@ go_library(
         "ipv4.go",
         "ipv6.go",
         "ipv6_fragment.go",
+        "ndp_neighbor_advert.go",
+        "ndp_neighbor_solicit.go",
+        "ndp_options.go",
+        "ndp_router_advert.go",
         "tcp.go",
         "udp.go",
     ],
@@ -30,13 +34,26 @@ go_library(
 )
 
 go_test(
-    name = "header_test",
+    name = "header_x_test",
     size = "small",
     srcs = [
+        "checksum_test.go",
         "ipversion_test.go",
         "tcp_test.go",
     ],
     deps = [
         ":header",
+        "//pkg/tcpip/buffer",
+    ],
+)
+
+go_test(
+    name = "header_test",
+    size = "small",
+    srcs = [
+        "eth_test.go",
+        "ndp_test.go",
     ],
+    embed = [":header"],
+    deps = ["//pkg/tcpip"],
 )
diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go
index 39a4d69be..9749c7f4d 100644
--- a/pkg/tcpip/header/checksum.go
+++ b/pkg/tcpip/header/checksum.go
@@ -23,11 +23,17 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
-func calculateChecksum(buf []byte, initial uint32) uint16 {
+func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) {
 	v := initial
 
+	if odd {
+		v += uint32(buf[0])
+		buf = buf[1:]
+	}
+
 	l := len(buf)
-	if l&1 != 0 {
+	odd = l&1 != 0
+	if odd {
 		l--
 		v += uint32(buf[l]) << 8
 	}
@@ -36,7 +42,7 @@ func calculateChecksum(buf []byte, initial uint32) uint16 {
 		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
 	}
 
-	return ChecksumCombine(uint16(v), uint16(v>>16))
+	return ChecksumCombine(uint16(v), uint16(v>>16)), odd
 }
 
 // Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
@@ -44,7 +50,8 @@ func calculateChecksum(buf []byte, initial uint32) uint16 {
 //
 // The initial checksum must have been computed on an even number of bytes.
 func Checksum(buf []byte, initial uint16) uint16 {
-	return calculateChecksum(buf, uint32(initial))
+	s, _ := calculateChecksum(buf, false, uint32(initial))
+	return s
 }
 
 // ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in
@@ -52,19 +59,40 @@ func Checksum(buf []byte, initial uint16) uint16 {
 //
 // The initial checksum must have been computed on an even number of bytes.
 func ChecksumVV(vv buffer.VectorisedView, initial uint16) uint16 {
-	var odd bool
+	return ChecksumVVWithOffset(vv, initial, 0, vv.Size())
+}
+
+// ChecksumVVWithOffset calculates the checksum (as defined in RFC 1071) of up
+// to size bytes of the given VectorisedView, starting at byte offset off.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func ChecksumVVWithOffset(vv buffer.VectorisedView, initial uint16, off int, size int) uint16 {
+	odd := false
 	sum := initial
 	for _, v := range vv.Views() {
 		if len(v) == 0 {
 			continue
 		}
-		s := uint32(sum)
-		if odd {
-			s += uint32(v[0])
-			v = v[1:]
+
+		if off >= len(v) {
+			off -= len(v)
+			continue
+		}
+		v = v[off:]
+
+		l := len(v)
+		if l > size {
+			l = size
+		}
+		v = v[:l]
+
+		sum, odd = calculateChecksum(v, odd, uint32(sum))
+
+		size -= len(v)
+		if size == 0 {
+			break
 		}
-		odd = len(v)&1 != 0
-		sum = calculateChecksum(v, s)
+		off = 0
 	}
 	return sum
 }
diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go
new file mode 100644
index 000000000..86b466c1c
--- /dev/null
+++ b/pkg/tcpip/header/checksum_test.go
@@ -0,0 +1,109 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package header_test contains external tests for the encoding and decoding
+// of network protocol headers implemented by package header.
+package header_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestChecksumVVWithOffset(t *testing.T) {
+	testCases := []struct {
+		name      string
+		vv        buffer.VectorisedView
+		off, size int
+		initial   uint16
+		want      uint16
+	}{
+		{
+			name: "empty",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+			}),
+			off:  0,
+			size: 0,
+			want: 0,
+		},
+		{
+			name: "OneView",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+			}),
+			off:  0,
+			size: 5,
+			want: 1294,
+		},
+		{
+			name: "TwoViews",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+			}),
+			off:  0,
+			size: 11,
+			want: 33819,
+		},
+		{
+			name: "TwoViewsWithOffset",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+			}),
+			off:  1,
+			size: 11,
+			want: 33819,
+		},
+		{
+			name: "ThreeViewsWithOffset",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+			}),
+			off:  7,
+			size: 11,
+			want: 33819,
+		},
+		{
+			name: "ThreeViewsWithInitial",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{77, 11, 33, 0, 55, 44}),
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123, 99}),
+			}),
+			initial: 77,
+			off:     7,
+			size:    11,
+			want:    33896,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got, want := header.ChecksumVVWithOffset(tc.vv, tc.initial, tc.off, tc.size), tc.want; got != want {
+				t.Errorf("header.ChecksumVVWithOffset(%v) = %v, want: %v", tc, got, tc.want)
+			}
+			v := tc.vv.ToView()
+			v.TrimFront(tc.off)
+			v.CapLength(tc.size)
+			if got, want := header.Checksum(v, tc.initial), tc.want; got != want {
+				t.Errorf("header.Checksum(%v) = %v, want: %v", tc, got, tc.want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go
index 4c3d3311f..f5d2c127f 100644
--- a/pkg/tcpip/header/eth.go
+++ b/pkg/tcpip/header/eth.go
@@ -48,8 +48,48 @@ const (
 
 	// EthernetAddressSize is the size, in bytes, of an ethernet address.
 	EthernetAddressSize = 6
+
+	// unspecifiedEthernetAddress is the unspecified ethernet address
+	// (all bits set to 0).
+	unspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00")
+
+	// unicastMulticastFlagMask is the mask of the least significant bit in
+	// the first octet (in network byte order) of an ethernet address that
+	// determines whether the ethernet address is a unicast or multicast. If
+	// the masked bit is a 1, then the address is a multicast, unicast
+	// otherwise.
+	//
+	// See the IEEE Std 802-2001 document for more details. Specifically,
+	// section 9.2.1 of http://ieee802.org/secmail/pdfocSP2xXA6d.pdf:
+	// "A 48-bit universal address consists of two parts. The first 24 bits
+	// correspond to the OUI as assigned by the IEEE, except that the
+	// assignee may set the LSB of the first octet to 1 for group addresses
+	// or set it to 0 for individual addresses."
+	unicastMulticastFlagMask = 1
+
+	// unicastMulticastFlagByteIdx is the byte that holds the
+	// unicast/multicast flag. See unicastMulticastFlagMask.
+	unicastMulticastFlagByteIdx = 0
+)
+
+const (
+	// EthernetProtocolAll is a catch-all for all protocols carried inside
+	// an ethernet frame. It is mainly used to create packet sockets that
+	// capture all traffic.
+	EthernetProtocolAll tcpip.NetworkProtocolNumber = 0x0003
+
+	// EthernetProtocolPUP is the PARC Universal Packet protocol ethertype.
+	EthernetProtocolPUP tcpip.NetworkProtocolNumber = 0x0200
 )
 
+// Ethertypes holds the protocol numbers describing the payload of an ethernet
+// frame. These types aren't necessarily supported by netstack, but can be used
+// to catch all traffic of a type via packet endpoints.
+var Ethertypes = []tcpip.NetworkProtocolNumber{
+	EthernetProtocolAll,
+	EthernetProtocolPUP,
+}
+
 // SourceAddress returns the "MAC source" field of the ethernet frame header.
 func (b Ethernet) SourceAddress() tcpip.LinkAddress {
 	return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize])
@@ -72,3 +112,25 @@ func (b Ethernet) Encode(e *EthernetFields) {
 	copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr)
 	copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr)
 }
+
+// IsValidUnicastEthernetAddress reports whether addr can be used as a unicast
+// ethernet address. To qualify, addr must be exactly EthernetAddressSize
+// bytes long, must differ from the unspecified address, and must have the
+// unicast/multicast flag bit cleared.
+func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool {
+	switch {
+	case len(addr) != EthernetAddressSize:
+		// Not the length of an ethernet address.
+		return false
+
+	case addr == unspecifiedEthernetAddress:
+		// The unspecified address is not a valid unicast address.
+		return false
+
+	case addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0:
+		// The multicast flag is set.
+		return false
+	}
+
+	return true
+}
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
new file mode 100644
index 000000000..6634c90f5
--- /dev/null
+++ b/pkg/tcpip/header/eth_test.go
@@ -0,0 +1,68 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// TestIsValidUnicastEthernetAddress runs IsValidUnicastEthernetAddress over a table of addresses.
+func TestIsValidUnicastEthernetAddress(t *testing.T) {
+	cases := []struct {
+		name     string
+		addr     tcpip.LinkAddress
+		expected bool
+	}{
+		{
+			name:     "Nil",
+			addr:     tcpip.LinkAddress([]byte(nil)),
+			expected: false,
+		},
+		{
+			name:     "Empty",
+			addr:     tcpip.LinkAddress(""),
+			expected: false,
+		},
+		{
+			name:     "InvalidLength",
+			addr:     tcpip.LinkAddress("\x01\x02\x03"),
+			expected: false,
+		},
+		{
+			name:     "Unspecified",
+			addr:     unspecifiedEthernetAddress,
+			expected: false,
+		},
+		{
+			name:     "Multicast",
+			addr:     tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			expected: false,
+		},
+		{
+			name:     "Valid",
+			addr:     tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06"),
+			expected: true,
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := IsValidUnicastEthernetAddress(tc.addr); got != tc.expected {
+				t.Fatalf("got IsValidUnicastEthernetAddress = %t, want = %t", got, tc.expected)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index 1125a7d14..c2bfd8c79 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -25,6 +25,12 @@ import (
 type ICMPv6 []byte
 
 const (
+	// ICMPv6HeaderSize is the size of the ICMPv6 header. That is, the
+	// sum of the size of the ICMPv6 Type, Code and Checksum fields, as
+	// per RFC 4443 section 2.1. After the ICMPv6 header, the ICMPv6
+	// message body begins.
+	ICMPv6HeaderSize = 4
+
 	// ICMPv6MinimumSize is the minimum size of a valid ICMP packet.
 	ICMPv6MinimumSize = 8
 
@@ -37,10 +43,16 @@ const (
 
 	// ICMPv6NeighborSolicitMinimumSize is the minimum size of a
 	// neighbor solicitation packet.
-	ICMPv6NeighborSolicitMinimumSize = ICMPv6MinimumSize + 16
+	ICMPv6NeighborSolicitMinimumSize = ICMPv6HeaderSize + NDPNSMinimumSize
+
+	// ICMPv6NeighborAdvertMinimumSize is the minimum size of a
+	// neighbor advertisement packet.
+	ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize
 
-	// ICMPv6NeighborAdvertSize is size of a neighbor advertisement.
-	ICMPv6NeighborAdvertSize = 32
+	// ICMPv6NeighborAdvertSize is the size of a neighbor advertisement
+	// including the NDP Target Link Layer option for an Ethernet
+	// address.
+	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + ndpTargetEthernetLinkLayerAddressSize
 
 	// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
 	ICMPv6EchoMinimumSize = 8
@@ -68,6 +80,13 @@ const (
 	// icmpv6SequenceOffset is the offset of the sequence field
 	// in a ICMPv6 Echo Request/Reply message.
 	icmpv6SequenceOffset = 6
+
+	// NDPHopLimit is the expected IP hop limit value of 255 for received
+	// NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1,
+	// 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently
+	// drop the NDP packet. All outgoing NDP packets must use this value for
+	// their IP hop limit field.
+	NDPHopLimit = 255
 )
 
 // ICMPv6Type is the ICMP type field described in RFC 4443 and friends.
@@ -166,6 +185,13 @@ func (b ICMPv6) SetSequence(sequence uint16) {
 	binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence)
 }
 
+// NDPPayload returns the NDP payload buffer. That is, it returns the ICMPv6
+// packet's message body as defined by RFC 4443 section 2.1; the portion of the
+// ICMPv6 buffer after the first ICMPv6HeaderSize bytes.
+func (b ICMPv6) NDPPayload() []byte {
+	return b[ICMPv6HeaderSize:]
+}
+
 // Payload implements Transport.Payload.
 func (b ICMPv6) Payload() []byte {
 	return b[ICMPv6PayloadOffset:]
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 9d3abc0e4..f1e60911b 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -87,7 +87,8 @@ const (
 	// section 5.
 	IPv6MinimumMTU = 1280
 
-	// IPv6Any is the non-routable IPv6 "any" meta address.
+	// IPv6Any is the non-routable IPv6 "any" meta address. It is also
+	// known as the unspecified address.
 	IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 )
 
@@ -100,6 +101,15 @@ var IPv6EmptySubnet = func() tcpip.Subnet {
 	return subnet
 }()
 
+// IPv6LinkLocalPrefix is the prefix for IPv6 link-local addresses, as defined
+// by RFC 4291 section 2.5.6.
+//
+// The prefix is fe80::/64
+var IPv6LinkLocalPrefix = tcpip.AddressWithPrefix{
+	Address:   "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	PrefixLen: 64,
+}
+
 // PayloadLength returns the value of the "payload length" field of the ipv6
 // header.
 func (b IPv6) PayloadLength() uint16 {
diff --git a/pkg/tcpip/header/ndp_neighbor_advert.go b/pkg/tcpip/header/ndp_neighbor_advert.go
new file mode 100644
index 000000000..505c92668
--- /dev/null
+++ b/pkg/tcpip/header/ndp_neighbor_advert.go
@@ -0,0 +1,110 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// NDPNeighborAdvert is an NDP Neighbor Advertisement message. It will
+// only contain the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.4 for more details.
+type NDPNeighborAdvert []byte
+
+const (
+	// NDPNAMinimumSize is the minimum size of a valid NDP Neighbor
+	// Advertisement message (body of an ICMPv6 packet).
+	NDPNAMinimumSize = 20
+
+	// ndpNATargetAddressOffset is the start of the Target Address
+	// field within an NDPNeighborAdvert.
+	ndpNATargetAddressOffset = 4
+
+	// ndpNAOptionsOffset is the start of the NDP options in an
+	// NDPNeighborAdvert.
+	ndpNAOptionsOffset = ndpNATargetAddressOffset + IPv6AddressSize
+
+	// ndpNAFlagsOffset is the offset of the flags within an
+	// NDPNeighborAdvert.
+	ndpNAFlagsOffset = 0
+
+	// ndpNARouterFlagMask is the mask of the Router Flag field in
+	// the flags byte within an NDPNeighborAdvert.
+	ndpNARouterFlagMask = (1 << 7)
+
+	// ndpNASolicitedFlagMask is the mask of the Solicited Flag field in
+	// the flags byte within an NDPNeighborAdvert.
+	ndpNASolicitedFlagMask = (1 << 6)
+
+	// ndpNAOverrideFlagMask is the mask of the Override Flag field in
+	// the flags byte within an NDPNeighborAdvert.
+	ndpNAOverrideFlagMask = (1 << 5)
+)
+
+// TargetAddress returns the value within the Target Address field.
+func (b NDPNeighborAdvert) TargetAddress() tcpip.Address {
+	return tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize])
+}
+
+// SetTargetAddress sets the value within the Target Address field.
+func (b NDPNeighborAdvert) SetTargetAddress(addr tcpip.Address) {
+	copy(b[ndpNATargetAddressOffset:][:IPv6AddressSize], addr)
+}
+
+// RouterFlag returns the value of the Router Flag field.
+func (b NDPNeighborAdvert) RouterFlag() bool {
+	return b[ndpNAFlagsOffset]&ndpNARouterFlagMask != 0
+}
+
+// SetRouterFlag sets the value in the Router Flag field.
+func (b NDPNeighborAdvert) SetRouterFlag(f bool) {
+	if f {
+		b[ndpNAFlagsOffset] |= ndpNARouterFlagMask
+	} else {
+		b[ndpNAFlagsOffset] &^= ndpNARouterFlagMask
+	}
+}
+
+// SolicitedFlag returns the value of the Solicited Flag field.
+func (b NDPNeighborAdvert) SolicitedFlag() bool {
+	return b[ndpNAFlagsOffset]&ndpNASolicitedFlagMask != 0
+}
+
+// SetSolicitedFlag sets the value in the Solicited Flag field.
+func (b NDPNeighborAdvert) SetSolicitedFlag(f bool) {
+	if f {
+		b[ndpNAFlagsOffset] |= ndpNASolicitedFlagMask
+	} else {
+		b[ndpNAFlagsOffset] &^= ndpNASolicitedFlagMask
+	}
+}
+
+// OverrideFlag returns the value of the Override Flag field.
+func (b NDPNeighborAdvert) OverrideFlag() bool {
+	return b[ndpNAFlagsOffset]&ndpNAOverrideFlagMask != 0
+}
+
+// SetOverrideFlag sets the value in the Override Flag field.
+func (b NDPNeighborAdvert) SetOverrideFlag(f bool) {
+	if f {
+		b[ndpNAFlagsOffset] |= ndpNAOverrideFlagMask
+	} else {
+		b[ndpNAFlagsOffset] &^= ndpNAOverrideFlagMask
+	}
+}
+
+// Options returns an NDPOptions of the options body.
+func (b NDPNeighborAdvert) Options() NDPOptions {
+	return NDPOptions(b[ndpNAOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_neighbor_solicit.go b/pkg/tcpip/header/ndp_neighbor_solicit.go
new file mode 100644
index 000000000..3a1b8e139
--- /dev/null
+++ b/pkg/tcpip/header/ndp_neighbor_solicit.go
@@ -0,0 +1,52 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// NDPNeighborSolicit is an NDP Neighbor Solicitation message. It will only
+// contain the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.3 for more details.
+type NDPNeighborSolicit []byte
+
+const (
+	// NDPNSMinimumSize is the minimum size of a valid NDP Neighbor
+	// Solicitation message (body of an ICMPv6 packet).
+	NDPNSMinimumSize = 20
+
+	// ndpNSTargetAddessOffset is the start of the Target Address
+	// field within an NDPNeighborSolicit.
+	ndpNSTargetAddessOffset = 4
+
+	// ndpNSOptionsOffset is the start of the NDP options in an
+	// NDPNeighborSolicit.
+	ndpNSOptionsOffset = ndpNSTargetAddessOffset + IPv6AddressSize
+)
+
+// TargetAddress returns the value within the Target Address field.
+func (b NDPNeighborSolicit) TargetAddress() tcpip.Address {
+	return tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize])
+}
+
+// SetTargetAddress sets the value within the Target Address field.
+func (b NDPNeighborSolicit) SetTargetAddress(addr tcpip.Address) {
+	copy(b[ndpNSTargetAddessOffset:][:IPv6AddressSize], addr)
+}
+
+// Options returns an NDPOptions of the options body.
+func (b NDPNeighborSolicit) Options() NDPOptions {
+	return NDPOptions(b[ndpNSOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
new file mode 100644
index 000000000..b28bde15b
--- /dev/null
+++ b/pkg/tcpip/header/ndp_options.go
@@ -0,0 +1,172 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// NDPTargetLinkLayerAddressOptionType is the type of the Target
+	// Link-Layer Address option, as per RFC 4861 section 4.6.1.
+	NDPTargetLinkLayerAddressOptionType = 2
+
+	// ndpTargetEthernetLinkLayerAddressSize is the size of a Target
+	// Link Layer Option for an Ethernet address.
+	ndpTargetEthernetLinkLayerAddressSize = 8
+
+	// lengthByteUnits is the multiplier factor for the Length field of an
+	// NDP option. That is, the length field for NDP options is in units of
+	// 8 octets, as per RFC 4861 section 4.6.
+	lengthByteUnits = 8
+)
+
+// NDPOptions is a buffer of NDP options as defined by RFC 4861 section 4.6.
+type NDPOptions []byte
+
+// Serialize serializes the provided list of NDP options into b.
+//
+// Note, b must be of sufficient size to hold all the options in s. See
+// NDPOptionsSerializer.Length for details on getting the total size of a
+// serialized NDPOptionsSerializer.
+//
+// Serialize returns the number of bytes written and may panic if b is not
+// of sufficient size to hold all the options in s.
+func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
+	done := 0
+
+	for _, o := range s {
+		l := paddedLength(o)
+		// Skip options that paddedLength rejects (empty or too long).
+		if l == 0 {
+			continue
+		}
+
+		b[0] = o.Type()
+
+		// We know this is safe because paddedLength would have returned
+		// 0 if o had an invalid length (> 255 * lengthByteUnits).
+		b[1] = uint8(l / lengthByteUnits)
+
+		// Serialize NDP option body.
+		used := o.serializeInto(b[2:])
+
+		// Zero out remaining (padding) bytes, if any exist.
+		for i := used + 2; i < l; i++ {
+			b[i] = 0
+		}
+
+		b = b[l:]
+		done += l
+	}
+
+	return done
+}
+
+// ndpOption is the set of functions to be implemented by all NDP option types.
+type ndpOption interface {
+	// Type returns the type of this ndpOption.
+	Type() uint8
+
+	// Length returns the length of the body of this ndpOption, in bytes.
+	Length() int
+
+	// serializeInto serializes this ndpOption into the provided byte
+	// buffer.
+	//
+	// Note, the caller MUST provide a byte buffer with size of at least
+	// Length. Implementers of this function may assume that the byte buffer
+	// is of sufficient size. serializeInto MAY panic if the provided byte
+	// buffer is not of sufficient size.
+	//
+	// serializeInto will return the number of bytes that was used to
+	// serialize this ndpOption. Implementers must only use the number of
+	// bytes required to serialize this ndpOption. Callers MAY provide a
+	// larger buffer than required to serialize into.
+	serializeInto([]byte) int
+}
+
+// paddedLength returns the length of o, in bytes, with any padding bytes, if
+// required.
+func paddedLength(o ndpOption) int {
+	l := o.Length()
+
+	if l == 0 {
+		return 0
+	}
+
+	// Length excludes the 2 Type and Length bytes.
+	l += 2
+
+	// Add extra bytes if needed to make sure the option is
+	// lengthByteUnits-byte aligned. We do this by adding lengthByteUnits-1
+	// to l and then stripping off the last few LSBits from l. This will
+	// make sure that l is rounded up to the nearest unit of
+	// lengthByteUnits. This works since lengthByteUnits is a power of 2
+	// (= 8).
+	mask := lengthByteUnits - 1
+	l += mask
+	l &^= mask
+
+	if l/lengthByteUnits > 255 {
+		// Should never happen because an option can only have a max
+		// value of 255 for its Length field, so just return 0 so this
+		// option does not get serialized.
+		//
+		// Returning 0 here will make sure that this option does not get
+		// serialized when NDPOptions.Serialize is called with the
+		// NDPOptionsSerializer that holds this option, effectively
+		// skipping this option during serialization. Also note that
+		// a value of zero for the Length field in an NDP option is
+		// invalid so this is another sign to the caller that this NDP
+		// option is malformed, as per RFC 4861 section 4.6.
+		return 0
+	}
+
+	return l
+}
+
+// NDPOptionsSerializer is a serializer for NDP options.
+type NDPOptionsSerializer []ndpOption
+
+// Length returns the total number of bytes required to serialize.
+func (b NDPOptionsSerializer) Length() int {
+	l := 0
+
+	for _, o := range b {
+		l += paddedLength(o)
+	}
+
+	return l
+}
+
+// NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option
+// as defined by RFC 4861 section 4.6.1.
+type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
+
+// Type implements ndpOption.Type.
+func (o NDPTargetLinkLayerAddressOption) Type() uint8 {
+	return NDPTargetLinkLayerAddressOptionType
+}
+
+// Length implements ndpOption.Length.
+func (o NDPTargetLinkLayerAddressOption) Length() int {
+	return len(o)
+}
+
+// serializeInto implements ndpOption.serializeInto.
+func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
+	return copy(b, o)
+}
diff --git a/pkg/tcpip/header/ndp_router_advert.go b/pkg/tcpip/header/ndp_router_advert.go
new file mode 100644
index 000000000..bf7610863
--- /dev/null
+++ b/pkg/tcpip/header/ndp_router_advert.go
@@ -0,0 +1,112 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+	"time"
+)
+
+// NDPRouterAdvert is an NDP Router Advertisement message. It will only contain
+// the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.2 for more details.
+type NDPRouterAdvert []byte
+
+const (
+	// NDPRAMinimumSize is the minimum size of a valid NDP Router
+	// Advertisement message (body of an ICMPv6 packet).
+	NDPRAMinimumSize = 12
+
+	// ndpRACurrHopLimitOffset is the byte of the Curr Hop Limit field
+	// within an NDPRouterAdvert.
+	ndpRACurrHopLimitOffset = 0
+
+	// ndpRAFlagsOffset is the byte with the NDP RA bit-fields/flags
+	// within an NDPRouterAdvert.
+	ndpRAFlagsOffset = 1
+
+	// ndpRAManagedAddrConfFlagMask is the mask of the Managed Address
+	// Configuration flag within the bit-field/flags byte of an
+	// NDPRouterAdvert.
+	ndpRAManagedAddrConfFlagMask = (1 << 7)
+
+	// ndpRAOtherConfFlagMask is the mask of the Other Configuration flag
+	// within the bit-field/flags byte of an NDPRouterAdvert.
+	ndpRAOtherConfFlagMask = (1 << 6)
+
+	// ndpRARouterLifetimeOffset is the start of the 2-byte Router Lifetime
+	// field within an NDPRouterAdvert.
+	ndpRARouterLifetimeOffset = 2
+
+	// ndpRAReachableTimeOffset is the start of the 4-byte Reachable Time
+	// field within an NDPRouterAdvert.
+	ndpRAReachableTimeOffset = 4
+
+	// ndpRARetransTimerOffset is the start of the 4-byte Retrans Timer
+	// field within an NDPRouterAdvert.
+	ndpRARetransTimerOffset = 8
+
+	// ndpRAOptionsOffset is the start of the NDP options in an
+	// NDPRouterAdvert.
+	ndpRAOptionsOffset = 12
+)
+
+// CurrHopLimit returns the value of the Curr Hop Limit field.
+func (b NDPRouterAdvert) CurrHopLimit() uint8 {
+	return b[ndpRACurrHopLimitOffset]
+}
+
+// ManagedAddrConfFlag returns the value of the Managed Address Configuration
+// flag.
+func (b NDPRouterAdvert) ManagedAddrConfFlag() bool {
+	return b[ndpRAFlagsOffset]&ndpRAManagedAddrConfFlagMask != 0
+}
+
+// OtherConfFlag returns the value of the Other Configuration flag.
+func (b NDPRouterAdvert) OtherConfFlag() bool {
+	return b[ndpRAFlagsOffset]&ndpRAOtherConfFlagMask != 0
+}
+
+// RouterLifetime returns the lifetime associated with the default router. A
+// value of 0 means the source of the Router Advertisement is not a default
+// router and SHOULD NOT appear on the default router list. Note, a value of 0
+// only means that the router should not be used as a default router, it does
+// not apply to other information contained in the Router Advertisement.
+func (b NDPRouterAdvert) RouterLifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 4861 section 4.2.
+	return time.Second * time.Duration(binary.BigEndian.Uint16(b[ndpRARouterLifetimeOffset:]))
+}
+
+// ReachableTime returns the time that a node assumes a neighbor is reachable
+// after having received a reachability confirmation. A value of 0 means
+// that it is unspecified by the source of the Router Advertisement message.
+func (b NDPRouterAdvert) ReachableTime() time.Duration {
+	// The field is the time in milliseconds, as per RFC 4861 section 4.2.
+	return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRAReachableTimeOffset:]))
+}
+
+// RetransTimer returns the time between retransmitted Neighbor Solicitation
+// messages. A value of 0 means that it is unspecified by the source of the
+// Router Advertisement message.
+func (b NDPRouterAdvert) RetransTimer() time.Duration {
+	// The field is the time in milliseconds, as per RFC 4861 section 4.2.
+	return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRARetransTimerOffset:]))
+}
+
+// Options returns an NDPOptions of the options body.
+func (b NDPRouterAdvert) Options() NDPOptions {
+	return NDPOptions(b[ndpRAOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
new file mode 100644
index 000000000..0aac14f43
--- /dev/null
+++ b/pkg/tcpip/header/ndp_test.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bytes"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// TestNDPNeighborSolicit tests the functions of NDPNeighborSolicit.
+func TestNDPNeighborSolicit(t *testing.T) {
+	b := []byte{
+		0, 0, 0, 0,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+	}
+
+	// Test getting the Target Address.
+	ns := NDPNeighborSolicit(b)
+	addr := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
+	if got := ns.TargetAddress(); got != addr {
+		t.Fatalf("got ns.TargetAddress = %s, want %s", got, addr)
+	}
+
+	// Test updating the Target Address.
+	addr2 := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
+	ns.SetTargetAddress(addr2)
+	if got := ns.TargetAddress(); got != addr2 {
+		t.Fatalf("got ns.TargetAddress = %s, want %s", got, addr2)
+	}
+	// Make sure the address got updated in the backing buffer.
+	if got := tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize]); got != addr2 {
+		t.Fatalf("got targetaddress buffer = %s, want %s", got, addr2)
+	}
+}
+
+// TestNDPNeighborAdvert tests the functions of NDPNeighborAdvert.
+func TestNDPNeighborAdvert(t *testing.T) {
+	b := []byte{
+		160, 0, 0, 0,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+	}
+
+	// Test getting the Target Address.
+	na := NDPNeighborAdvert(b)
+	addr := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
+	if got := na.TargetAddress(); got != addr {
+		t.Fatalf("got TargetAddress = %s, want %s", got, addr)
+	}
+
+	// Test getting the Router Flag.
+	if got := na.RouterFlag(); !got {
+		t.Fatalf("got RouterFlag = false, want = true")
+	}
+
+	// Test getting the Solicited Flag.
+	if got := na.SolicitedFlag(); got {
+		t.Fatalf("got SolicitedFlag = true, want = false")
+	}
+
+	// Test getting the Override Flag.
+	if got := na.OverrideFlag(); !got {
+		t.Fatalf("got OverrideFlag = false, want = true")
+	}
+
+	// Test updating the Target Address.
+	addr2 := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
+	na.SetTargetAddress(addr2)
+	if got := na.TargetAddress(); got != addr2 {
+		t.Fatalf("got TargetAddress = %s, want %s", got, addr2)
+	}
+	// Make sure the address got updated in the backing buffer.
+	if got := tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize]); got != addr2 {
+		t.Fatalf("got targetaddress buffer = %s, want %s", got, addr2)
+	}
+
+	// Test updating the Router Flag.
+	na.SetRouterFlag(false)
+	if got := na.RouterFlag(); got {
+		t.Fatalf("got RouterFlag = true, want = false")
+	}
+
+	// Test updating the Solicited Flag.
+	na.SetSolicitedFlag(true)
+	if got := na.SolicitedFlag(); !got {
+		t.Fatalf("got SolicitedFlag = false, want = true")
+	}
+
+	// Test updating the Override Flag.
+	na.SetOverrideFlag(false)
+	if got := na.OverrideFlag(); got {
+		t.Fatalf("got OverrideFlag = true, want = false")
+	}
+
+	// Make sure flags got updated in the backing buffer (only Solicited is set).
+	if got := b[ndpNAFlagsOffset]; got != 64 {
+		t.Fatalf("got flags byte = %d, want = 64", got)
+	}
+}
+
+func TestNDPRouterAdvert(t *testing.T) {
+	b := []byte{
+		64, 128, 1, 2,
+		3, 4, 5, 6,
+		7, 8, 9, 10,
+	}
+
+	ra := NDPRouterAdvert(b)
+
+	if got := ra.CurrHopLimit(); got != 64 {
+		t.Fatalf("got ra.CurrHopLimit = %d, want = 64", got)
+	}
+
+	if got := ra.ManagedAddrConfFlag(); !got {
+		t.Fatalf("got ManagedAddrConfFlag = false, want = true")
+	}
+
+	if got := ra.OtherConfFlag(); got {
+		t.Fatalf("got OtherConfFlag = true, want = false")
+	}
+
+	if got, want := ra.RouterLifetime(), time.Second*258; got != want {
+		t.Fatalf("got ra.RouterLifetime = %d, want = %d", got, want)
+	}
+
+	if got, want := ra.ReachableTime(), time.Millisecond*50595078; got != want {
+		t.Fatalf("got ra.ReachableTime = %d, want = %d", got, want)
+	}
+
+	if got, want := ra.RetransTimer(), time.Millisecond*117967114; got != want {
+		t.Fatalf("got ra.RetransTimer = %d, want = %d", got, want)
+	}
+}
+
+// TestNDPTargetLinkLayerAddressOptionSerialize tests serializing a
+// NDPTargetLinkLayerAddressOption.
+func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
+	tests := []struct {
+		name        string
+		buf         []byte
+		expectedBuf []byte
+		addr        tcpip.LinkAddress
+	}{
+		{
+			"Ethernet",
+			make([]byte, 8),
+			[]byte{2, 1, 1, 2, 3, 4, 5, 6},
+			"\x01\x02\x03\x04\x05\x06",
+		},
+		{
+			"Padding",
+			[]byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+			[]byte{2, 2, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0},
+			"\x01\x02\x03\x04\x05\x06\x07\x08",
+		},
+		{
+			"Empty",
+			[]byte{},
+			[]byte{},
+			"",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+			serializer := NDPOptionsSerializer{
+				NDPTargetLinkLayerAddressOption(test.addr),
+			}
+			if got, want := int(serializer.Length()), len(test.expectedBuf); got != want {
+				t.Fatalf("got Length = %d, want = %d", got, want)
+			}
+			opts.Serialize(serializer)
+			if !bytes.Equal(test.buf, test.expectedBuf) {
+				t.Fatalf("got b = %d, want = %d", test.buf, test.expectedBuf)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 18adb2085..14f197a77 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -72,7 +72,7 @@ func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.Vector
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
 func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, vv buffer.VectorisedView) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil))
+	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil), nil /* linkHeader */)
 }
 
 // Attach saves the stack network-layer dispatcher for use later when packets
@@ -96,7 +96,7 @@ func (e *Endpoint) MTU() uint32 {
 func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
 	caps := stack.LinkEndpointCapabilities(0)
 	if e.GSO {
-		caps |= stack.CapabilityGSO
+		caps |= stack.CapabilityHardwareGSO
 	}
 	return caps
 }
@@ -134,5 +134,49 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	return nil
 }
 
+// WritePackets stores outbound packets into the channel, stopping at the
+// first packet the channel cannot accept without blocking.
+func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	flat := payload.ToView()
+	queued := 0
+	for i := range hdrs {
+		desc := &hdrs[i]
+		start, end := desc.Off, desc.Off+desc.Size
+		info := PacketInfo{
+			Header:  desc.Hdr.View(),
+			Proto:   protocol,
+			Payload: buffer.NewViewFromBytes(flat[start:end]),
+			GSO:     gso,
+		}
+
+		// Non-blocking send: a full channel ends the batch early.
+		select {
+		case e.C <- info:
+			queued++
+			continue
+		default:
+		}
+		return queued, nil
+	}
+	return queued, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	p := PacketInfo{
+		Header:  packet.ToView(),
+		Proto:   0,
+		Payload: buffer.View{},
+		GSO:     nil,
+	}
+	// Non-blocking send: the packet is dropped if the channel cannot take it.
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index f80ac3435..ae4858529 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -165,6 +165,9 @@ type Options struct {
 	// disabled.
 	GSOMaxSize uint32
 
+	// SoftwareGSOEnabled indicates whether software GSO is enabled or not.
+	SoftwareGSOEnabled bool
+
 	// PacketDispatchMode specifies the type of inbound dispatcher to be
 	// used for this endpoint.
 	PacketDispatchMode PacketDispatchMode
@@ -242,7 +245,11 @@ func New(opts *Options) (stack.LinkEndpoint, error) {
 		}
 		if isSocket {
 			if opts.GSOMaxSize != 0 {
-				e.caps |= stack.CapabilityGSO
+				if opts.SoftwareGSOEnabled {
+					e.caps |= stack.CapabilitySoftwareGSO
+				} else {
+					e.caps |= stack.CapabilityHardwareGSO
+				}
 				e.gsoMaxSize = opts.GSOMaxSize
 			}
 		}
@@ -397,7 +404,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 		eth.Encode(ethHdr)
 	}
 
-	if e.Capabilities()&stack.CapabilityGSO != 0 {
+	if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		vnetHdr := virtioNetHdr{}
 		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
 		if gso != nil {
@@ -430,8 +437,130 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
 }
 
-// WriteRawPacket writes a raw packet directly to the file descriptor.
-func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
+// WritePackets writes outbound packets to the file descriptor in batches via
+// rawfile.NonBlockingSendMMsg. It returns how many were sent and any send error.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	var ethHdrBuf []byte
+	// hdr + data
+	iovLen := 2
+	if e.hdrSize > 0 {
+		// Add ethernet header if needed.
+		ethHdrBuf = make([]byte, header.EthernetMinimumSize)
+		eth := header.Ethernet(ethHdrBuf)
+		ethHdr := &header.EthernetFields{
+			DstAddr: r.RemoteLinkAddress,
+			Type:    protocol,
+		}
+
+		// Preserve the src address if it's set in the route.
+		if r.LocalLinkAddress != "" {
+			ethHdr.SrcAddr = r.LocalLinkAddress
+		} else {
+			ethHdr.SrcAddr = e.addr
+		}
+		eth.Encode(ethHdr)
+		iovLen++
+	}
+
+	n := len(hdrs)
+
+	views := payload.Views()
+	/*
+	 * Each boundary in views can add one more iovec.
+	 *
+	 * payload |      |          |         |
+	 *         -----------------------------
+	 * packets |    |    |    |    |    |  |
+	 *         -----------------------------
+	 * iovecs  |    | |  |    |  | |    |  |
+	 */
+	iovec := make([]syscall.Iovec, n*iovLen+len(views)-1)
+	mmsgHdrs := make([]rawfile.MMsgHdr, n)
+
+	iovecIdx := 0
+	viewIdx := 0
+	viewOff := 0
+	off := 0
+	nextOff := 0
+	for i := range hdrs {
+		prevIovecIdx := iovecIdx
+		mmsgHdr := &mmsgHdrs[i]
+		mmsgHdr.Msg.Iov = &iovec[iovecIdx]
+		packetSize := hdrs[i].Size
+		hdr := &hdrs[i].Hdr
+
+		off = hdrs[i].Off
+		if off != nextOff {
+			// The previous packet ended elsewhere: seek to off from the payload start.
+			size := off
+			viewIdx = 0
+			viewOff = 0
+			for size > 0 {
+				if size >= len(views[viewIdx]) {
+					size -= len(views[viewIdx])
+					viewIdx++
+					viewOff = 0
+				} else {
+					viewOff = size
+					size = 0
+				}
+			}
+		}
+		nextOff = off + packetSize
+
+		if ethHdrBuf != nil {
+			v := &iovec[iovecIdx]
+			v.Base = &ethHdrBuf[0]
+			v.Len = uint64(len(ethHdrBuf))
+			iovecIdx++
+		}
+
+		v := &iovec[iovecIdx]
+		hdrView := hdr.View()
+		v.Base = &hdrView[0]
+		v.Len = uint64(len(hdrView))
+		iovecIdx++
+
+		for packetSize > 0 {
+			vec := &iovec[iovecIdx]
+			iovecIdx++
+
+			v := views[viewIdx]
+			vec.Base = &v[viewOff]
+			s := len(v) - viewOff
+			if s <= packetSize {
+				viewIdx++
+				viewOff = 0
+			} else {
+				s = packetSize
+				viewOff += s
+			}
+			vec.Len = uint64(s)
+			packetSize -= s
+		}
+
+		mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx)
+	}
+
+	packets := 0
+	for packets < n {
+		sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs)
+		if err != nil {
+			return packets, err
+		}
+		packets += sent
+		mmsgHdrs = mmsgHdrs[sent:]
+	}
+	return packets, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	return rawfile.NonBlockingWrite(e.fds[0], packet.ToView())
+}
+
+// InjectOutbound implements stack.InjectableEndpoint.InjectOutbound.
+func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
 	return rawfile.NonBlockingWrite(e.fds[0], packet)
 }
 
@@ -468,9 +597,9 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 	e.dispatcher = dispatcher
 }
 
-// Inject injects an inbound packet.
-func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv)
+// InjectInbound injects an inbound packet.
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
 }
 
 // NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 04406bc9a..59378b96c 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -43,9 +43,10 @@ const (
 )
 
 type packetInfo struct {
-	raddr    tcpip.LinkAddress
-	proto    tcpip.NetworkProtocolNumber
-	contents buffer.View
+	raddr      tcpip.LinkAddress
+	proto      tcpip.NetworkProtocolNumber
+	contents   buffer.View
+	linkHeader buffer.View
 }
 
 type context struct {
@@ -92,8 +93,8 @@ func (c *context) cleanup() {
 	syscall.Close(c.fds[1])
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	c.ch <- packetInfo{remote, protocol, vv.ToView()}
+func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+	c.ch <- packetInfo{remote, protocol, vv.ToView(), linkHeader}
 }
 
 func TestNoEthernetProperties(t *testing.T) {
@@ -293,11 +294,12 @@ func TestDeliverPacket(t *testing.T) {
 					b[i] = uint8(rand.Intn(256))
 				}
 
+				var hdr header.Ethernet
 				if !eth {
 					// So that it looks like an IPv4 packet.
 					b[0] = 0x40
 				} else {
-					hdr := make(header.Ethernet, header.EthernetMinimumSize)
+					hdr = make(header.Ethernet, header.EthernetMinimumSize)
 					hdr.Encode(&header.EthernetFields{
 						SrcAddr: raddr,
 						DstAddr: laddr,
@@ -315,9 +317,10 @@ func TestDeliverPacket(t *testing.T) {
 				select {
 				case pi := <-c.ch:
 					want := packetInfo{
-						raddr:    raddr,
-						proto:    proto,
-						contents: b,
+						raddr:      raddr,
+						proto:      proto,
+						contents:   b,
+						linkHeader: buffer.View(hdr),
 					}
 					if !eth {
 						want.proto = header.IPv4ProtocolNumber
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 8bfeb97e4..554d45715 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -169,9 +169,10 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	var (
 		p             tcpip.NetworkProtocolNumber
 		remote, local tcpip.LinkAddress
+		eth           header.Ethernet
 	)
 	if d.e.hdrSize > 0 {
-		eth := header.Ethernet(pkt)
+		eth = header.Ethernet(pkt)
 		p = eth.Type()
 		remote = eth.SourceAddress()
 		local = eth.DestinationAddress()
@@ -189,6 +190,6 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}), buffer.View(eth))
 	return true, nil
 }
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index 7ca217e5b..12168a1dc 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -53,7 +53,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
 	d := &readVDispatcher{fd: fd, e: e}
 	d.views = make([]buffer.View, len(BufConfig))
 	iovLen := len(BufConfig)
-	if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		iovLen++
 	}
 	d.iovecs = make([]syscall.Iovec, iovLen)
@@ -63,7 +63,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
 func (d *readVDispatcher) allocateViews(bufConfig []int) {
 	var vnetHdr [virtioNetHdrSize]byte
 	vnetHdrOff := 0
-	if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		// The kernel adds virtioNetHdr before each packet, but
 		// we don't use it, so so we allocate a buffer for it,
 		// add it in iovecs but don't add it in a view.
@@ -106,7 +106,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	if err != nil {
 		return false, err
 	}
-	if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		// Skip virtioNetHdr which is added before each packet, it
 		// isn't used and it isn't in a view.
 		n -= virtioNetHdrSize
@@ -118,9 +118,10 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	var (
 		p             tcpip.NetworkProtocolNumber
 		remote, local tcpip.LinkAddress
+		eth           header.Ethernet
 	)
 	if d.e.hdrSize > 0 {
-		eth := header.Ethernet(d.views[0])
+		eth = header.Ethernet(d.views[0][:header.EthernetMinimumSize])
 		p = eth.Type()
 		remote = eth.SourceAddress()
 		local = eth.DestinationAddress()
@@ -141,7 +142,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	vv := buffer.NewVectorisedView(n, d.views[:used])
 	vv.TrimFront(d.e.hdrSize)
 
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv)
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
 
 	// Prepare e.views for another packet: release used views.
 	for i := 0; i < used; i++ {
@@ -194,7 +195,7 @@ func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
 	}
 	d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv)
 	iovLen := len(BufConfig)
-	if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		// virtioNetHdr is prepended before each packet.
 		iovLen++
 	}
@@ -225,7 +226,7 @@ func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) {
 	for k := 0; k < len(d.views); k++ {
 		var vnetHdr [virtioNetHdrSize]byte
 		vnetHdrOff := 0
-		if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+		if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 			// The kernel adds virtioNetHdr before each packet, but
 			// we don't use it, so so we allocate a buffer for it,
 			// add it in iovecs but don't add it in a view.
@@ -261,7 +262,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 	// Process each of received packets.
 	for k := 0; k < nMsgs; k++ {
 		n := int(d.msgHdrs[k].Len)
-		if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+		if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 			n -= virtioNetHdrSize
 		}
 		if n <= d.e.hdrSize {
@@ -271,9 +272,10 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		var (
 			p             tcpip.NetworkProtocolNumber
 			remote, local tcpip.LinkAddress
+			eth           header.Ethernet
 		)
 		if d.e.hdrSize > 0 {
-			eth := header.Ethernet(d.views[k][0])
+			eth = header.Ethernet(d.views[k][0])
 			p = eth.Type()
 			remote = eth.SourceAddress()
 			local = eth.DestinationAddress()
@@ -293,7 +295,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		used := d.capViews(k, int(n), BufConfig)
 		vv := buffer.NewVectorisedView(int(n), d.views[k][:used])
 		vv.TrimFront(d.e.hdrSize)
-		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv)
+		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
 
 		// Prepare e.views for another packet: release used views.
 		for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
index 47a54845c..23e4d1418 100644
--- a/pkg/tcpip/link/loopback/BUILD
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -10,6 +10,7 @@ go_library(
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
 )
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index b36629d2c..a3b48fa73 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -23,6 +23,7 @@ package loopback
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
@@ -70,6 +71,9 @@ func (*endpoint) LinkAddress() tcpip.LinkAddress {
 	return ""
 }
 
+// Wait implements stack.LinkEndpoint.Wait.
+func (*endpoint) Wait() {}
+
 // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
 // packets to the network-layer dispatcher.
 func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
@@ -81,10 +85,27 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependa
 	// Because we're immediately turning around and writing the packet back to the
 	// rx path, we intentionally don't preserve the remote and local link
 	// addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv)
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
 
 	return nil
 }
 
-// Wait implements stack.LinkEndpoint.Wait.
-func (*endpoint) Wait() {}
+// WritePackets implements stack.LinkEndpoint.WritePackets. It is not implemented and always panics.
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	// Reject the packet if it's shorter than an ethernet header.
+	if packet.Size() < header.EthernetMinimumSize {
+		return tcpip.ErrBadAddress
+	}
+
+	// The ethernet header may span several views, so flatten before slicing;
+	// slicing packet.First() alone is out of bounds when the first view is short.
+	linkHeader := header.Ethernet(packet.ToView()[:header.EthernetMinimumSize])
+	packet.TrimFront(len(linkHeader))
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), packet, buffer.View(linkHeader))
+	return nil
+}
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 7c946101d..682b60291 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -79,29 +79,47 @@ func (m *InjectableEndpoint) IsAttached() bool {
 	return m.dispatcher != nil
 }
 
-// Inject implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv)
+// InjectInbound implements stack.InjectableLinkEndpoint.
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+}
+
+// WritePackets hands the outbound packets to the LinkInjectableEndpoint
+// registered for r.RemoteAddress, or returns ErrNoRoute if there is none.
+// HandleLocal only works if r.RemoteAddress has a route registered here.
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	if target, found := m.routes[r.RemoteAddress]; found {
+		return target.WritePackets(r, gso, hdrs, payload, protocol)
+	}
+	// No endpoint is registered for this remote address.
+	return 0, tcpip.ErrNoRoute
 }
 
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
 // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
 // route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
 	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
-		return endpoint.WritePacket(r, nil /* gso */, hdr, payload, protocol)
+		return endpoint.WritePacket(r, gso, hdr, payload, protocol)
 	}
 	return tcpip.ErrNoRoute
 }
 
-// WriteRawPacket writes outbound packets to the appropriate
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (m *InjectableEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	// WriteRawPacket doesn't get a route or network address, so there's
+	// nowhere to write this.
+	return tcpip.ErrNoRoute
+}
+
+// InjectOutbound writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the dest address.
-func (m *InjectableEndpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
+func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
 	endpoint, ok := m.routes[dest]
 	if !ok {
 		return tcpip.ErrNoRoute
 	}
-	return endpoint.WriteRawPacket(dest, packet)
+	return endpoint.InjectOutbound(dest, packet)
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 3086fec00..9cd300af8 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -31,7 +31,7 @@ import (
 func TestInjectableEndpointRawDispatch(t *testing.T) {
 	endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
 
-	endpoint.WriteRawPacket(dstIP, []byte{0xFA})
+	endpoint.InjectOutbound(dstIP, []byte{0xFA})
 
 	buf := make([]byte, ipv4.MaxTotalSize)
 	bytesRead, err := sock.Read(buf)
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index 2e8bc772a..05c7b8024 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -16,5 +16,8 @@ go_library(
     visibility = [
         "//visibility:public",
     ],
-    deps = ["//pkg/tcpip"],
+    deps = [
+        "//pkg/tcpip",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
 )
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
index 7e286a3a6..44e25d475 100644
--- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -22,6 +22,7 @@ import (
 	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
@@ -101,6 +102,16 @@ func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error {
 	return nil
 }
 
+// NonBlockingSendMMsg sends msgHdrs on fd with one sendmmsg call and returns the number sent.
+func NonBlockingSendMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
+	// MSG_DONTWAIT keeps the call from blocking; msgHdrs[0] is indexed, so msgHdrs must be non-empty.
+	n, _, e := syscall.RawSyscall6(unix.SYS_SENDMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0)
+	if e != 0 {
+		return 0, TranslateErrno(e)
+	}
+	return int(n), nil
+}
+
 // PollEvent represents the pollfd structure passed to a poll() system call.
 type PollEvent struct {
 	FD      int32
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 9e71d4edf..279e2b457 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -212,6 +212,26 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependa
 	return nil
 }
 
+// WritePackets implements stack.LinkEndpoint.WritePackets. It is not implemented and always panics.
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. The packet is
+// flattened and passed to e.tx.transmit while e.mu is held.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	flat := packet.ToView()
+
+	e.mu.Lock()
+	sent := e.tx.transmit(flat, buffer.View{})
+	e.mu.Unlock()
+
+	if sent {
+		return nil
+	}
+	return tcpip.ErrWouldBlock
+}
+
 // dispatchLoop reads packets from the rx queue in a loop and dispatches them
 // to the network stack.
 func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
@@ -254,7 +274,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 
 		// Send packet up the stack.
 		eth := header.Ethernet(b)
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView())
+		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(), buffer.View(eth))
 	}
 
 	// Clean state.
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 0e9ba0846..f3e9705c9 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -78,9 +78,10 @@ func (q *queueBuffers) cleanup() {
 }
 
 type packetInfo struct {
-	addr  tcpip.LinkAddress
-	proto tcpip.NetworkProtocolNumber
-	vv    buffer.VectorisedView
+	addr       tcpip.LinkAddress
+	proto      tcpip.NetworkProtocolNumber
+	vv         buffer.VectorisedView
+	linkHeader buffer.View
 }
 
 type testContext struct {
@@ -130,12 +131,13 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
-		addr:  remoteLinkAddr,
-		proto: proto,
-		vv:    vv.Clone(nil),
+		addr:       remoteLinkAddr,
+		proto:      proto,
+		vv:         vv.Clone(nil),
+		linkHeader: linkHeader,
 	})
 	c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index e401dce44..39757ea2a 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -116,7 +116,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("recv", protocol, vv.First(), nil)
 	}
@@ -147,7 +147,7 @@ func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local
 			panic(err)
 		}
 	}
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv)
+	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
 }
 
 // Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
@@ -193,10 +193,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-// WritePacket implements the stack.LinkEndpoint interface. It is called by
-// higher-level protocols to write packets; it just logs the packet and forwards
-// the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *endpoint) dumpPacket(gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("send", protocol, hdr.View(), gso)
 	}
@@ -218,28 +215,74 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 			panic(err)
 		}
 		length -= len(hdrBuf)
-		if length > 0 {
-			for _, v := range payload.Views() {
-				if len(v) > length {
-					v = v[:length]
-				}
-				n, err := buf.Write(v)
-				if err != nil {
-					panic(err)
-				}
-				length -= n
-				if length == 0 {
-					break
-				}
-			}
-		}
+		logVectorisedView(payload, length, buf)
 		if _, err := e.file.Write(buf.Bytes()); err != nil {
 			panic(err)
 		}
 	}
+}
+
+// WritePacket implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it just logs the packet and
+// forwards the request to the lower endpoint.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+	e.dumpPacket(gso, hdr, payload, protocol)
 	return e.lower.WritePacket(r, gso, hdr, payload, protocol)
 }
 
+// WritePackets implements the stack.LinkEndpoint interface. It logs every
+// packet in the batch and then forwards the batch to the lower endpoint.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	flat := payload.ToView()
+	for i := range hdrs {
+		pkt := flat[hdrs[i].Off:][:hdrs[i].Size]
+		e.dumpPacket(gso, hdrs[i].Hdr, buffer.NewVectorisedView(hdrs[i].Size, []buffer.View{pkt}), protocol)
+	}
+	return e.lower.WritePackets(r, gso, hdrs, payload, protocol)
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
+		logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
+	}
+	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
+		length := packet.Size()
+		if length > int(e.maxPCAPLen) {
+			length = int(e.maxPCAPLen)
+		}
+		// The PCAP packet header is built from the clamped length and the full packet size.
+		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
+		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(packet.Size()))); err != nil {
+			panic(err)
+		}
+		logVectorisedView(packet, length, buf)
+		if _, err := e.file.Write(buf.Bytes()); err != nil {
+			panic(err)
+		}
+	}
+	return e.lower.WriteRawPacket(packet)
+}
+
+// logVectorisedView appends at most length bytes of vv to buf.
+func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer) {
+	if length <= 0 {
+		return
+	}
+	for _, chunk := range vv.Views() {
+		if length < len(chunk) {
+			chunk = chunk[:length]
+		}
+		written, err := buf.Write(chunk)
+		if err != nil {
+			panic(err)
+		}
+		if length -= written; length == 0 {
+			return
+		}
+	}
+}
+
 // Wait implements stack.LinkEndpoint.Wait.
 func (*endpoint) Wait() {}
 
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index 5a1791cb5..a04fc1062 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
 
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv)
+	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
 	e.dispatchGate.Leave()
 }
 
@@ -109,6 +109,30 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	return err
 }
 
+// WritePackets implements stack.LinkEndpoint.WritePackets. Higher-level
+// protocols call it to write packets. The packets reach the lower endpoint
+// only if neither Wait nor WaitWrite has been called.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	if e.writeGate.Enter() {
+		sent, err := e.lower.WritePackets(r, gso, hdrs, payload, protocol)
+		e.writeGate.Leave()
+		return sent, err
+	}
+	// The write gate is closed: report every packet as written without forwarding.
+	return len(hdrs), nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	if e.writeGate.Enter() {
+		err := e.lower.WriteRawPacket(packet)
+		e.writeGate.Leave()
+		return err
+	}
+	// The write gate is closed, so the packet is dropped without an error.
+	return nil
+}
+
 // WaitWrite prevents new calls to WritePacket from reaching the lower endpoint,
 // and waits for inflight ones to finish before returning.
 func (e *Endpoint) WaitWrite() {
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index ae23c96b7..5f0f8fa2d 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
 	e.dispatchCount++
 }
 
@@ -70,6 +70,17 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.P
 	return nil
 }
 
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	e.writeCount += len(hdrs) // Each packet descriptor counts as one write.
+	return len(hdrs), nil
+}
+
+func (e *countedEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+	e.writeCount++ // Only the call is recorded; the packet contents are ignored.
+	return nil
+}
+
 // Wait implements stack.LinkEndpoint.Wait.
 func (*countedEndpoint) Wait() {}
 
@@ -109,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 922181ac0..46178459e 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -83,6 +83,11 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buf
 	return tcpip.ErrNotSupported
 }
 
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketDescriptor, buffer.VectorisedView, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) {
+	return 0, tcpip.ErrNotSupported // Always fails with zero packets written, mirroring WritePacket.
+}
+
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
@@ -109,7 +114,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 		copy(pkt.HardwareAddressTarget(), h.HardwareAddressSender())
 		copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender())
 		e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
+		fallthrough // also fill the cache from requests
 	case header.ARPReply:
+		addr := tcpip.Address(h.ProtocolAddressSender())
+		linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
+		e.linkAddrCache.AddLinkAddress(e.nicid, addr, linkAddr)
 	}
 }
 
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index 825ff3392..2cad0a0b6 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -28,6 +28,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
+        "//pkg/tcpip",
         "//pkg/tcpip/buffer",
     ],
 )
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 1628a82be..6da5238ec 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -17,6 +17,7 @@
 package fragmentation
 
 import (
+	"fmt"
 	"log"
 	"sync"
 	"time"
@@ -82,7 +83,7 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t
 
 // Process processes an incoming fragment belonging to an ID
 // and returns a complete packet when all the packets belonging to that ID have been received.
-func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool) {
+func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
 	f.mu.Lock()
 	r, ok := f.reassemblers[id]
 	if ok && r.tooOld(f.timeout) {
@@ -97,8 +98,15 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
 	}
 	f.mu.Unlock()
 
-	res, done, consumed := r.process(first, last, more, vv)
-
+	res, done, consumed, err := r.process(first, last, more, vv)
+	if err != nil {
+		// We probably got an invalid sequence of fragments. Just
+		// discard the reassembler and move on.
+		f.mu.Lock()
+		f.release(r)
+		f.mu.Unlock()
+		return buffer.VectorisedView{}, false, fmt.Errorf("fragmentation processing error: %v", err)
+	}
 	f.mu.Lock()
 	f.size += consumed
 	if done {
@@ -114,7 +122,7 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
 		}
 	}
 	f.mu.Unlock()
-	return res, done
+	return res, done, nil
 }
 
 func (f *Fragmentation) release(r *reassembler) {
diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go
index 799798544..72c0f53be 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation_test.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go
@@ -83,7 +83,10 @@ func TestFragmentationProcess(t *testing.T) {
 		t.Run(c.comment, func(t *testing.T) {
 			f := NewFragmentation(1024, 512, DefaultReassembleTimeout)
 			for i, in := range c.in {
-				vv, done := f.Process(in.id, in.first, in.last, in.more, in.vv)
+				vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv)
+				if err != nil {
+					t.Fatalf("f.Process(%+v, %+d, %+d, %t, %+v) failed: %v", in.id, in.first, in.last, in.more, in.vv, err)
+				}
 				if !reflect.DeepEqual(vv, c.out[i].vv) {
 					t.Errorf("got Process(%d) = %+v, want = %+v", i, vv, c.out[i].vv)
 				}
@@ -114,7 +117,10 @@ func TestReassemblingTimeout(t *testing.T) {
 	time.Sleep(2 * timeout)
 	// Send another fragment that completes a packet.
 	// However, no packet should be reassembled because the fragment arrived after the timeout.
-	_, done := f.Process(0, 1, 1, false, vv(1, "1"))
+	_, done, err := f.Process(0, 1, 1, false, vv(1, "1"))
+	if err != nil {
+		t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err)
+	}
 	if done {
 		t.Errorf("Fragmentation does not respect the reassembling timeout.")
 	}
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 8037f734b..9e002e396 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -78,7 +78,7 @@ func (r *reassembler) updateHoles(first, last uint16, more bool) bool {
 	return used
 }
 
-func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int) {
+func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int, error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	consumed := 0
@@ -86,7 +86,7 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
 		// A concurrent goroutine might have already reassembled
 		// the packet and emptied the heap while this goroutine
 		// was waiting on the mutex. We don't have to do anything in this case.
-		return buffer.VectorisedView{}, false, consumed
+		return buffer.VectorisedView{}, false, consumed, nil
 	}
 	if r.updateHoles(first, last, more) {
 		// We store the incoming packet only if it filled some holes.
@@ -96,13 +96,13 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
 	}
 	// Check if all the holes have been deleted and we are ready to reassamble.
 	if r.deleted < len(r.holes) {
-		return buffer.VectorisedView{}, false, consumed
+		return buffer.VectorisedView{}, false, consumed, nil
 	}
 	res, err := r.heap.reassemble()
 	if err != nil {
-		panic(fmt.Sprintf("reassemble failed with: %v. There is probably a bug in the code handling the holes.", err))
+		return buffer.VectorisedView{}, false, consumed, fmt.Errorf("fragment reassembly failed: %v", err)
 	}
-	return res, true, consumed
+	return res, true, consumed, nil
 }
 
 func (r *reassembler) tooOld(timeout time.Duration) bool {
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index f644a8b08..8d74497ba 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -171,6 +171,15 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prepen
 	return nil
 }
 
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, hdr []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented") // Stub: any call to the batched write path panics.
+}
+
+func (t *testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error {
+	return tcpip.ErrNotSupported // Stub: raw writes are always rejected.
+}
+
 func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index df1a08113..1339f8474 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -198,10 +198,9 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buff
 	return nil
 }
 
-// WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
 	ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
-	length := uint16(hdr.UsedLength() + payload.Size())
+	length := uint16(hdr.UsedLength() + payloadSize)
 	id := uint32(0)
 	if length > header.IPv4MaximumHeaderSize+8 {
 		// Packets of 68 bytes or less are required by RFC 791 to not be
@@ -219,6 +218,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 		DstAddr:     r.RemoteAddress,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
+}
+
+// WritePacket writes a packet to the given destination address and protocol.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+	e.addIPHeader(r, &hdr, payload.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(payload.Views()))
@@ -242,6 +246,23 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	return nil
 }
 
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+	if loop&stack.PacketLoop != 0 { // Looping a batch of packets back locally is not supported.
+		panic("multiple packets in local loop")
+	}
+	if loop&stack.PacketOut == 0 { // Outbound delivery not requested: report all packets as written.
+		return len(hdrs), nil
+	}
+
+	for i := range hdrs { // Prepend an IPv4 header to each packet's own header buffer.
+		e.addIPHeader(r, &hdrs[i].Hdr, hdrs[i].Size, params)
+	}
+	n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber)
+	r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) // Count only the n packets the link endpoint reported.
+	return n, err
+}
+
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
@@ -326,7 +347,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 			return
 		}
 		var ready bool
-		vv, ready = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv)
+		var err error
+		vv, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv)
+		if err != nil {
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			r.Stats().IP.MalformedFragmentsReceived.Increment()
+			return
+		}
 		if !ready {
 			return
 		}
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 560638ce8..99f84acd7 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -47,10 +47,6 @@ func TestExcludeBroadcast(t *testing.T) {
 		t.Fatalf("CreateNIC failed: %v", err)
 	}
 
-	if err := s.AddAddress(1, ipv4.ProtocolNumber, header.IPv4Any); err != nil {
-		t.Fatalf("AddAddress failed: %v", err)
-	}
-
 	s.SetRouteTable([]tcpip.Route{{
 		Destination: header.IPv4EmptySubnet,
 		NIC:         1,
@@ -441,6 +437,16 @@ func TestInvalidFragments(t *testing.T) {
 			1,
 			1,
 		},
+		{
+			"multiple_fragments_with_more_fragments_set_to_false",
+			[][]byte{
+				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x10, 0x00, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x01, 0x61, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x20, 0x00, 0x00, 0x06, 0x34, 0x1e, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+			},
+			1,
+			1,
+		},
 	}
 
 	for _, tc := range testCases {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index b5df85455..b289e902f 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -21,15 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
-const (
-	// ndpHopLimit is the expected IP hop limit value of 255 for received
-	// NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1,
-	// 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently
-	// drop the NDP packet. All outgoing NDP packets must use this value for
-	// its IP hop limit field.
-	ndpHopLimit = 255
-)
-
 // handleControl handles the case when an ICMP packet contains the headers of
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
@@ -79,6 +70,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		return
 	}
 	h := header.ICMPv6(v)
+	iph := header.IPv6(netHeader)
 
 	// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
 	// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
@@ -89,7 +81,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		header.ICMPv6RouterSolicit,
 		header.ICMPv6RouterAdvert,
 		header.ICMPv6RedirectMsg:
-		if header.IPv6(netHeader).HopLimit() != ndpHopLimit {
+		if iph.HopLimit() != header.NDPHopLimit {
 			received.Invalid.Increment()
 			return
 		}
@@ -121,25 +113,72 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-
 		if len(v) < header.ICMPv6NeighborSolicitMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
-		targetAddr := tcpip.Address(v[8:][:header.IPv6AddressSize])
+
+		ns := header.NDPNeighborSolicit(h.NDPPayload())
+		targetAddr := ns.TargetAddress()
+		s := r.Stack()
+		rxNICID := r.NICID()
+
+		isTentative, err := s.IsAddrTentative(rxNICID, targetAddr)
+		if err != nil {
+			// We will only get an error if rxNICID is unrecognized,
+			// which should not happen. For now short-circuit this
+			// packet.
+			//
+			// TODO(b/141002840): Handle this better?
+			return
+		}
+
+		if isTentative {
+			// If the target address is tentative and the source
+			// of the packet is a unicast (specified) address, then
+			// the source of the packet is attempting to perform
+			// address resolution on the target. In this case, the
+			// solicitation is silently ignored, as per RFC 4862
+			// section 5.4.3.
+			//
+			// If the target address is tentative and the source of
+			// the packet is the unspecified address (::), then we
+			// know another node is also performing DAD for the
+			// same address (since targetAddr is tentative for us,
+			// we know we are also performing DAD on it). In this
+			// case we let the stack know so it can handle such a
+			// scenario and do nothing further with the NDP NS.
+			if iph.SourceAddress() == header.IPv6Any {
+				s.DupTentativeAddrDetected(rxNICID, targetAddr)
+			}
+
+			// Do not handle neighbor solicitations targeted
+			// to an address that is tentative on the received
+			// NIC any further.
+			return
+		}
+
+		// At this point we know that targetAddr is not tentative on
+		// rxNICID so the packet is processed as defined in RFC 4861,
+		// as per RFC 4862 section 5.4.3.
+
 		if e.linkAddrCache.CheckLocalAddress(e.nicid, ProtocolNumber, targetAddr) == 0 {
 			// We don't have a useful answer; the best we can do is ignore the request.
 			return
 		}
 
-		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertSize)
+		optsSerializer := header.NDPOptionsSerializer{
+			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
+		}
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
 		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
 		pkt.SetType(header.ICMPv6NeighborAdvert)
-		pkt[icmpV6FlagOffset] = ndpSolicitedFlag | ndpOverrideFlag
-		copy(pkt[icmpV6OptOffset-len(targetAddr):], targetAddr)
-		pkt[icmpV6OptOffset] = ndpOptDstLinkAddr
-		pkt[icmpV6LengthOffset] = 1
-		copy(pkt[icmpV6LengthOffset+1:], r.LocalLinkAddress[:])
+		na := header.NDPNeighborAdvert(pkt.NDPPayload())
+		na.SetSolicitedFlag(true)
+		na.SetOverrideFlag(true)
+		na.SetTargetAddress(targetAddr)
+		opts := na.Options()
+		opts.Serialize(optsSerializer)
 
 		// ICMPv6 Neighbor Solicit messages are always sent to
 		// specially crafted IPv6 multicast addresses. As a result, the
@@ -154,7 +193,22 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		r.LocalAddress = targetAddr
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
-		if err := r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+		// TODO(tamird/ghanan): there exists an explicit NDP option that is
+		// used to update the neighbor table with link addresses for a
+		// neighbor from an NS (see the Source Link Layer option RFC
+		// 4861 section 4.6.1 and section 7.2.3).
+		//
+		// Furthermore, the entirety of NDP handling here seems to be
+		// contradicted by RFC 4861.
+		e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress)
+
+		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
+		//
+		// 7.1.2. Validation of Neighbor Advertisements
+		//
+		// The IP Hop Limit field has a value of 255, i.e., the packet
+		// could not possibly have been forwarded by a router.
+		if err := r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
@@ -166,7 +220,42 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		targetAddr := tcpip.Address(v[8:][:header.IPv6AddressSize])
+
+		na := header.NDPNeighborAdvert(h.NDPPayload())
+		targetAddr := na.TargetAddress()
+		stack := r.Stack()
+		rxNICID := r.NICID()
+
+		isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr)
+		if err != nil {
+			// We will only get an error if rxNICID is unrecognized,
+			// which should not happen. For now short-circuit this
+			// packet.
+			//
+			// TODO(b/141002840): Handle this better?
+			return
+		}
+
+		if isTentative {
+			// We just got an NA from a node that owns an address we
+			// are performing DAD on, implying the address is not
+			// unique. In this case we let the stack know so it can
+			// handle such a scenario and do nothing further with
+			// the NDP NA.
+			stack.DupTentativeAddrDetected(rxNICID, targetAddr)
+			return
+		}
+
+		// At this point we know that the targetAddress is not tentative
+		// on rxNICID. However, targetAddr may still be assigned to
+		// rxNICID but not tentative (it could be permanent). Such a
+		// scenario is beyond the scope of RFC 4862. As such, we simply
+		// ignore such a scenario for now and proceed as normal.
+		//
+		// TODO(b/143147598): Handle the scenario described above. Also
+		// inform the netstack integration that a duplicate address was
+		// detected outside of DAD.
+
 		e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress)
 		if targetAddr != r.RemoteAddress {
 			e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress)
@@ -178,7 +267,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-
 		vv.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
 		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
@@ -262,7 +350,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	ip.Encode(&header.IPv6Fields{
 		PayloadLength: length,
 		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-		HopLimit:      ndpHopLimit,
+		HopLimit:      header.NDPHopLimit,
 		SrcAddr:       r.LocalAddress,
 		DstAddr:       r.RemoteAddress,
 	})
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 501be208e..7c11dde55 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -15,7 +15,6 @@
 package ipv6
 
 import (
-	"fmt"
 	"reflect"
 	"strings"
 	"testing"
@@ -144,7 +143,7 @@ func TestICMPCounts(t *testing.T) {
 		ip.Encode(&header.IPv6Fields{
 			PayloadLength: uint16(payloadLength),
 			NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-			HopLimit:      ndpHopLimit,
+			HopLimit:      header.NDPHopLimit,
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
@@ -179,13 +178,10 @@ func visitStats(v reflect.Value, f func(string, *tcpip.StatCounter)) {
 	t := v.Type()
 	for i := 0; i < v.NumField(); i++ {
 		v := v.Field(i)
-		switch v.Kind() {
-		case reflect.Ptr:
-			f(t.Field(i).Name, v.Interface().(*tcpip.StatCounter))
-		case reflect.Struct:
+		if s, ok := v.Interface().(*tcpip.StatCounter); ok {
+			f(t.Field(i).Name, s)
+		} else {
 			visitStats(v, f)
-		default:
-			panic(fmt.Sprintf("unexpected type %s", v.Type()))
 		}
 	}
 }
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index cd1e34085..5898f8f9e 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -97,9 +97,8 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-// WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	length := uint16(hdr.UsedLength() + payload.Size())
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
+	length := uint16(hdr.UsedLength() + payloadSize)
 	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
 	ip.Encode(&header.IPv6Fields{
 		PayloadLength: length,
@@ -109,6 +108,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 		SrcAddr:       r.LocalAddress,
 		DstAddr:       r.RemoteAddress,
 	})
+}
+
+// WritePacket writes a packet to the given destination address and protocol.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+	e.addIPHeader(r, &hdr, payload.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(payload.Views()))
@@ -127,6 +131,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
 	return e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber)
 }
 
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+	if loop&stack.PacketLoop != 0 { // Looping a batch of packets back locally is not implemented.
+		panic("not implemented")
+	}
+	if loop&stack.PacketOut == 0 { // Outbound delivery not requested: report all packets as written.
+		return len(hdrs), nil
+	}
+
+	for i := range hdrs { // Prepend an IPv6 header to each packet's own header buffer.
+		hdr := &hdrs[i].Hdr
+		size := hdrs[i].Size
+		e.addIPHeader(r, hdr, size, params)
+	}
+
+	n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber)
+	r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) // Count only the n packets the link endpoint reported.
+	return n, err
+}
+
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
 func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index e30791fe3..c32716f2e 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -150,7 +150,7 @@ func TestHopLimitValidation(t *testing.T) {
 
 			// Receive the NDP packet with an invalid hop limit
 			// value.
-			handleIPv6Payload(hdr, ndpHopLimit-1, ep, &r)
+			handleIPv6Payload(hdr, header.NDPHopLimit-1, ep, &r)
 
 			// Invalid count should have increased.
 			if got := invalid.Value(); got != 1 {
@@ -164,7 +164,7 @@ func TestHopLimitValidation(t *testing.T) {
 			}
 
 			// Receive the NDP packet with a valid hop limit value.
-			handleIPv6Payload(hdr, ndpHopLimit, ep, &r)
+			handleIPv6Payload(hdr, header.NDPHopLimit, ep, &r)
 
 			// Rx count of NDP packet of type typ.typ should have
 			// increased.
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 6a78432c9..460db3cf8 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -22,6 +22,7 @@ go_library(
         "icmp_rate_limit.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
+        "ndp.go",
         "nic.go",
         "registration.go",
         "route.go",
@@ -53,6 +54,7 @@ go_test(
     name = "stack_x_test",
     size = "small",
     srcs = [
+        "ndp_test.go",
         "stack_test.go",
         "transport_demuxer_test.go",
         "transport_test.go",
@@ -61,14 +63,17 @@ go_test(
         ":stack",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
 
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
new file mode 100644
index 000000000..ea2dbed2e
--- /dev/null
+++ b/pkg/tcpip/stack/ndp.go
@@ -0,0 +1,319 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"log"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const (
+	// defaultDupAddrDetectTransmits is the default number of NDP Neighbor
+	// Solicitation messages to send when doing Duplicate Address Detection
+	// for a tentative address.
+	//
+	// Default = 1 (from RFC 4862 section 5.1)
+	defaultDupAddrDetectTransmits = 1
+
+	// defaultRetransmitTimer is the default amount of time to wait between
+	// sending NDP Neighbor solicitation messages.
+	//
+	// Default = 1s (from RFC 4861 section 10).
+	defaultRetransmitTimer = time.Second
+
+	// minimumRetransmitTimer is the minimum amount of time to wait between
+	// sending NDP Neighbor solicitation messages. Note, RFC 4861 does
+	// not impose a minimum Retransmit Timer, but we do here to make sure
+	// the messages are not sent all at once. We also come to this value
+	// because in the RetransmitTimer field of a Router Advertisement, a
+	// value of 0 means unspecified, so the smallest valid value is 1.
+	// Note, the unit of the RetransmitTimer field in the Router
+	// Advertisement is milliseconds.
+	//
+	// Min = 1ms.
+	minimumRetransmitTimer = time.Millisecond
+)
+
+// NDPDispatcher is the interface integrators of netstack must implement to
+// receive and handle NDP related events.
+type NDPDispatcher interface {
+	// OnDuplicateAddressDetectionStatus will be called when the DAD process
+	// for an address (addr) on a NIC (with ID nicid) completes. resolved
+	// will be set to true if DAD completed successfully (no duplicate addr
+	// detected); false otherwise (addr was detected to be a duplicate on
+	// the link the NIC is a part of, or it was stopped for some other
+	// reason, such as the address being removed). If an error occurred
+	// during DAD, err will be set and resolved must be ignored.
+	//
+	// This function is permitted to block indefinitely without interfering
+	// with the stack's operation.
+	OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
+}
+
+// NDPConfigurations is the NDP configurations for the netstack.
+type NDPConfigurations struct {
+	// The number of Neighbor Solicitation messages to send when doing
+	// Duplicate Address Detection for a tentative address.
+	//
+	// Note, a value of zero effectively disables DAD.
+	DupAddrDetectTransmits uint8
+
+	// The amount of time to wait between sending Neighbor Solicitation
+	// messages.
+	//
+	// Must be at least 1ms; smaller values are replaced by the 1s default.
+	RetransmitTimer time.Duration
+}
+
+// DefaultNDPConfigurations returns an NDPConfigurations populated with
+// default values.
+func DefaultNDPConfigurations() NDPConfigurations {
+	return NDPConfigurations{
+		DupAddrDetectTransmits: defaultDupAddrDetectTransmits,
+		RetransmitTimer:        defaultRetransmitTimer,
+	}
+}
+
+// validate modifies an NDPConfigurations with valid values. If invalid values
+// are present in c, the corresponding default values will be used instead.
+//
+// If RetransmitTimer is less than minimumRetransmitTimer, then a value of
+// defaultRetransmitTimer will be used.
+func (c *NDPConfigurations) validate() {
+	if c.RetransmitTimer < minimumRetransmitTimer {
+		c.RetransmitTimer = defaultRetransmitTimer
+	}
+}
+
+// ndpState is the per-interface NDP state.
+type ndpState struct {
+	// The NIC this ndpState is for.
+	nic *NIC
+
+	// The DAD state of each address on the NIC that is undergoing DAD.
+	dad map[tcpip.Address]dadState
+}
+
+// dadState holds the Duplicate Address Detection timer and the flag used to
+// signal to the timer's callback that DAD has been stopped.
+type dadState struct {
+	// The DAD timer to send the next NS message, or resolve the address.
+	timer *time.Timer
+
+	// Used to let the DAD timer know that it has been stopped.
+	//
+	// Must only be read from or written to while protected by the lock of
+	// the NIC this dadState is associated with.
+	done *bool
+}
+
+// startDuplicateAddressDetection performs Duplicate Address Detection.
+//
+// This function must only be called by IPv6 addresses that are currently
+// tentative.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+	// addr must be a valid unicast IPv6 address.
+	if !header.IsV6UnicastAddress(addr) {
+		return tcpip.ErrAddressFamilyNotSupported
+	}
+
+	// Should not attempt to perform DAD on an address that is currently in
+	// the DAD process.
+	if _, ok := ndp.dad[addr]; ok {
+		// Should never happen because we should only ever call this
+		// function for newly created addresses. If we attempted to
+		// "add" an address that already existed, we would have
+		// returned an error since we attempted to add a duplicate
+		// address, or its reference count would have been increased
+		// without doing the work that would have been done for an
+		// address that was brand new. See NIC.addPermanentAddressLocked.
+		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
+	}
+
+	remaining := ndp.nic.stack.ndpConfigs.DupAddrDetectTransmits
+
+	{
+		done, err := ndp.doDuplicateAddressDetection(addr, remaining, ref)
+		if err != nil {
+			return err
+		}
+		if done {
+			return nil
+		}
+	}
+
+	remaining--
+
+	var done bool
+	var timer *time.Timer
+	timer = time.AfterFunc(ndp.nic.stack.ndpConfigs.RetransmitTimer, func() {
+		var d bool
+		var err *tcpip.Error
+
+		// doDadIteration does a single iteration of the DAD loop.
+		//
+		// Returns true if the integrator needs to be informed of DAD
+		// completing.
+		doDadIteration := func() bool {
+			ndp.nic.mu.Lock()
+			defer ndp.nic.mu.Unlock()
+
+			if done {
+				// If we reach this point, it means that the DAD
+				// timer fired after another goroutine already
+				// obtained the NIC lock and stopped DAD before
+				// this function obtained the NIC lock. Simply
+				// return here and do nothing further.
+				return false
+			}
+
+			ref, ok := ndp.nic.endpoints[NetworkEndpointID{addr}]
+			if !ok {
+				// This should never happen.
+				// We should have an endpoint for addr since we
+				// are still performing DAD on it. If the
+				// endpoint does not exist, but we are doing DAD
+				// on it, then we started DAD at some point, but
+				// forgot to stop it when the endpoint was
+				// deleted.
+				panic(fmt.Sprintf("ndpdad: unrecognized addr %s for NIC(%d)", addr, ndp.nic.ID()))
+			}
+
+			d, err = ndp.doDuplicateAddressDetection(addr, remaining, ref)
+			if err != nil || d {
+				delete(ndp.dad, addr)
+
+				if err != nil {
+					log.Printf("ndpdad: Error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, ndp.nic.ID(), err)
+				}
+
+				// Let the integrator know DAD has completed.
+				return true
+			}
+
+			remaining--
+			timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer)
+			return false
+		}
+
+		if doDadIteration() && ndp.nic.stack.ndpDisp != nil {
+			ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, d, err)
+		}
+
+	})
+
+	ndp.dad[addr] = dadState{
+		timer: timer,
+		done:  &done,
+	}
+
+	return nil
+}
+
+// doDuplicateAddressDetection is called on every iteration of the timer, and
+// when DAD starts.
+//
+// It handles resolving the address (if there are no more NS to send), or
+// sending the next NS if there are more NS to send.
+//
+// This function must only be called by IPv6 addresses that are currently
+// tentative.
+//
+// The NIC that ndp belongs to MUST be locked.
+//
+// Returns true if DAD has resolved; false if DAD is still ongoing.
+func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining uint8, ref *referencedNetworkEndpoint) (bool, *tcpip.Error) {
+	if ref.getKind() != permanentTentative {
+		// The endpoint should still be marked as tentative
+		// since we are still performing DAD on it.
+		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
+	}
+
+	if remaining == 0 {
+		// DAD has resolved.
+		ref.setKind(permanent)
+		return true, nil
+	}
+
+	// Send a new NS.
+	snmc := header.SolicitedNodeAddr(addr)
+	snmcRef, ok := ndp.nic.endpoints[NetworkEndpointID{snmc}]
+	if !ok {
+		// This should never happen as if we have the
+		// address, we should have the solicited-node
+		// address.
+		panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", ndp.nic.ID(), snmc, addr))
+	}
+
+	// Use the unspecified address as the source address when performing
+	// DAD.
+	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), snmcRef, false, false)
+
+	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+	pkt.SetType(header.ICMPv6NeighborSolicit)
+	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+	ns.SetTargetAddress(addr)
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+	sent := r.Stats().ICMP.V6PacketsSent
+	if err := r.WritePacket(nil, hdr, buffer.VectorisedView{}, NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS}); err != nil {
+		sent.Dropped.Increment()
+		return false, err
+	}
+	sent.NeighborSolicit.Increment()
+
+	return false, nil
+}
+
+// stopDuplicateAddressDetection ends a running Duplicate Address Detection
+// process. Note, this may leave the DAD process for a tentative address in
+// such a state forever, unless some other external event resolves the DAD
+// process (receiving an NA from the true owner of addr, or an NS for addr
+// (implying another node is attempting to use addr)). It is up to the caller
+// of this function to handle such a scenario. Normally, addr will be removed
+// from the NIC right after this function returns or the address successfully
+// resolved.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
+	dad, ok := ndp.dad[addr]
+	if !ok {
+		// Not currently performing DAD on addr, just return.
+		return
+	}
+
+	if dad.timer != nil {
+		dad.timer.Stop()
+		dad.timer = nil
+
+		*dad.done = true
+		dad.done = nil
+	}
+
+	delete(ndp.dad, addr)
+
+	// Let the integrator know DAD did not resolve.
+	if ndp.nic.stack.ndpDisp != nil {
+		go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
new file mode 100644
index 000000000..b089ce2ae
--- /dev/null
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -0,0 +1,443 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+)
+
+const (
+	addr1     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	addr2     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	linkAddr1 = "\x02\x02\x03\x04\x05\x06"
+)
+
+// TestDADDisabled tests that an address successfully resolves immediately
+// when DAD is not enabled (the default for an empty stack.Options).
+func TestDADDisabled(t *testing.T) {
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	}
+
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(_) = %s", err)
+	}
+
+	if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+	}
+
+	// Should get the address immediately since we should not have performed
+	// DAD on it.
+	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+	}
+	if addr.Address != addr1 {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
+	}
+
+	// We should not have sent any NDP NS messages.
+	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
+		t.Fatalf("got NeighborSolicit = %d, want = 0", got)
+	}
+}
+
+// ndpDADEvent is a set of parameters that was passed to
+// ndpDispatcher.OnDuplicateAddressDetectionStatus.
+type ndpDADEvent struct {
+	nicid    tcpip.NICID
+	addr     tcpip.Address
+	resolved bool
+	err      *tcpip.Error
+}
+
+var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
+
+// ndpDispatcher implements NDPDispatcher so tests can know when various NDP
+// related events happen for test purposes.
+type ndpDispatcher struct {
+	dadC chan ndpDADEvent
+}
+
+// Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
+//
+// Forwards every DAD event, unfiltered, on n.dadC.
+func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
+	n.dadC <- ndpDADEvent{
+		nicid,
+		addr,
+		resolved,
+		err,
+	}
+}
+
+// TestDADResolve tests that an address successfully resolves after performing
+// DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
+// Included in the subtests is a test to make sure that an invalid
+// RetransmitTimer (<1ms) value gets fixed to the default RetransmitTimer of 1s.
+func TestDADResolve(t *testing.T) {
+	tests := []struct {
+		name                    string
+		dupAddrDetectTransmits  uint8
+		retransTimer            time.Duration
+		expectedRetransmitTimer time.Duration
+	}{
+		{"1:1s:1s", 1, time.Second, time.Second},
+		{"2:1s:1s", 2, time.Second, time.Second},
+		{"1:2s:2s", 1, 2 * time.Second, 2 * time.Second},
+		// 0s is an invalid RetransmitTimer value and will be fixed to
+		// the default RetransmitTimer value of 1s.
+		{"1:0s:1s", 1, 0, time.Second},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent),
+			}
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+			}
+			opts.NDPConfigs.RetransmitTimer = test.retransTimer
+			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
+
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			}
+
+			stat := s.Stats().ICMP.V6PacketsSent.NeighborSolicit
+
+			// Should have sent an NDP NS immediately.
+			if got := stat.Value(); got != 1 {
+				t.Fatalf("got NeighborSolicit = %d, want = 1", got)
+
+			}
+
+			// Address should not be considered bound to the NIC yet
+			// (DAD ongoing).
+			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+			}
+
+			// Wait for the remaining time - some delta (500ms), to
+			// make sure the address is still not resolved.
+			const delta = 500 * time.Millisecond
+			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - delta)
+			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+			}
+
+			// Wait for DAD to resolve.
+			select {
+			case <-time.After(2 * delta):
+				// We should get a resolution event after 500ms
+				// (delta) since we wait for 500ms less than the
+				// expected resolution time above to make sure
+				// that the address did not yet resolve. Waiting
+				// for 1s (2x delta) without a resolution event
+				// means something is wrong.
+				t.Fatal("timed out waiting for DAD resolution")
+			case e := <-ndpDisp.dadC:
+				if e.err != nil {
+					t.Fatal("got DAD error: ", e.err)
+				}
+				if e.nicid != 1 {
+					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				}
+				if e.addr != addr1 {
+					t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr1)
+				}
+				if !e.resolved {
+					t.Fatal("got DAD event w/ resolved = false, want = true")
+				}
+			}
+			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+			if addr.Address != addr1 {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
+			}
+
+			// Should not have sent any more NS messages.
+			if got := stat.Value(); got != uint64(test.dupAddrDetectTransmits) {
+				t.Fatalf("got NeighborSolicit = %d, want = %d", got, test.dupAddrDetectTransmits)
+			}
+
+			// Validate the sent Neighbor Solicitation messages.
+			for i := uint8(0); i < test.dupAddrDetectTransmits; i++ {
+				p := <-e.C
+
+				// Make sure it's an IPv6 packet.
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
+
+				// Check NDP packet.
+				checker.IPv6(t, p.Header.ToVectorisedView().First(),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPNS(
+						checker.NDPNSTargetAddress(addr1)))
+			}
+		})
+	}
+
+}
+
+// TestDADFail tests to make sure that the DAD process fails if another node is
+// detected to be performing DAD on the same address (receive an NS message from
+// a node doing DAD for the same address), or if another node is detected to own
+// the address already (receive an NA message for the tentative address).
+func TestDADFail(t *testing.T) {
+	tests := []struct {
+		name    string
+		makeBuf func(tgt tcpip.Address) buffer.Prependable
+		getStat func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	}{
+		{
+			"RxSolicit",
+			func(tgt tcpip.Address) buffer.Prependable {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize)
+				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+				pkt.SetType(header.ICMPv6NeighborSolicit)
+				ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+				ns.SetTargetAddress(tgt)
+				snmc := header.SolicitedNodeAddr(tgt)
+				pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{}))
+				payloadLength := hdr.UsedLength()
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(payloadLength),
+					NextHeader:    uint8(icmp.ProtocolNumber6),
+					HopLimit:      255,
+					SrcAddr:       header.IPv6Any,
+					DstAddr:       snmc,
+				})
+
+				return hdr
+
+			},
+			func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return s.NeighborSolicit
+			},
+		},
+		{
+			"RxAdvert",
+			func(tgt tcpip.Address) buffer.Prependable {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
+				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+				pkt.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(pkt.NDPPayload())
+				na.SetSolicitedFlag(true)
+				na.SetOverrideFlag(true)
+				na.SetTargetAddress(tgt)
+				pkt.SetChecksum(header.ICMPv6Checksum(pkt, tgt, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+				payloadLength := hdr.UsedLength()
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(payloadLength),
+					NextHeader:    uint8(icmp.ProtocolNumber6),
+					HopLimit:      255,
+					SrcAddr:       tgt,
+					DstAddr:       header.IPv6AllNodesMulticastAddress,
+				})
+
+				return hdr
+
+			},
+			func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return s.NeighborAdvert
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent),
+			}
+			ndpConfigs := stack.DefaultNDPConfigurations()
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs:       ndpConfigs,
+				NDPDisp:          &ndpDisp,
+			}
+			opts.NDPConfigs.RetransmitTimer = time.Second * 2
+
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+			}
+
+			// Address should not be considered bound to the NIC yet
+			// (DAD ongoing).
+			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+			}
+
+			// Receive a packet to simulate multiple nodes owning or
+			// attempting to own the same address.
+			hdr := test.makeBuf(addr1)
+			e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+
+			stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
+			if got := stat.Value(); got != 1 {
+				t.Fatalf("got stat = %d, want = 1", got)
+			}
+
+			// Wait for DAD to fail and make sure the address did
+			// not get resolved.
+			select {
+			case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+				// If we don't get a failure event after the
+				// expected resolution time + extra 1s buffer,
+				// something is wrong.
+				t.Fatal("timed out waiting for DAD failure")
+			case e := <-ndpDisp.dadC:
+				if e.err != nil {
+					t.Fatal("got DAD error: ", e.err)
+				}
+				if e.nicid != 1 {
+					t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+				}
+				if e.addr != addr1 {
+					t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr1)
+				}
+				if e.resolved {
+					t.Fatal("got DAD event w/ resolved = true, want = false")
+				}
+			}
+			addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+			}
+		})
+	}
+}
+
+// TestDADStop tests to make sure that the DAD process stops when an address is
+// removed.
+func TestDADStop(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.NDPConfigurations{
+		RetransmitTimer:        time.Second,
+		DupAddrDetectTransmits: 2,
+	}
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPDisp:          &ndpDisp,
+		NDPConfigs:       ndpConfigs,
+	}
+
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(_) = %s", err)
+	}
+
+	if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+	}
+
+	// Address should not be considered bound to the NIC yet (DAD ongoing).
+	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+	}
+
+	// Remove the address. This should stop DAD.
+	if err := s.RemoveAddress(1, addr1); err != nil {
+		t.Fatalf("RemoveAddress(_, %s) = %s", addr1, err)
+	}
+
+	// Wait for DAD to fail (since the address was removed during DAD).
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// If we don't get a failure event after the expected resolution
+		// time + extra 1s buffer, something is wrong.
+		t.Fatal("timed out waiting for DAD failure")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicid != 1 {
+			t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+		}
+		if e.addr != addr1 {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, addr1)
+		}
+		if e.resolved {
+			t.Fatal("got DAD event w/ resolved = true, want = false")
+		}
+
+	}
+	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+	}
+
+	// Should not have sent more than 1 NS message.
+	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
+		t.Fatalf("got NeighborSolicit = %d, want <= 1", got)
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 5993fe582..2d29fa88e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -19,7 +19,6 @@ import (
 	"sync"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/ilist"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -37,12 +36,17 @@ type NIC struct {
 	mu            sync.RWMutex
 	spoofing      bool
 	promiscuous   bool
-	primary       map[tcpip.NetworkProtocolNumber]*ilist.List
+	primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
 	endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
 	addressRanges []tcpip.Subnet
 	mcastJoins    map[NetworkEndpointID]int32
+	// packetEPs is protected by mu, but the contained PacketEndpoint
+	// values are not.
+	packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
 
 	stats NICStats
+
+	ndp ndpState
 }
 
 // NICStats includes transmitted and received stats.
@@ -77,15 +81,19 @@ const (
 )
 
 func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC {
-	return &NIC{
+	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
+	// example, make sure that the link address it provides is a valid
+	// unicast ethernet address.
+	nic := &NIC{
 		stack:      stack,
 		id:         id,
 		name:       name,
 		linkEP:     ep,
 		loopback:   loopback,
-		primary:    make(map[tcpip.NetworkProtocolNumber]*ilist.List),
+		primary:    make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint),
 		endpoints:  make(map[NetworkEndpointID]*referencedNetworkEndpoint),
 		mcastJoins: make(map[NetworkEndpointID]int32),
+		packetEPs:  make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint),
 		stats: NICStats{
 			Tx: DirectionStats{
 				Packets: &tcpip.StatCounter{},
@@ -96,7 +104,21 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 				Bytes:   &tcpip.StatCounter{},
 			},
 		},
+		ndp: ndpState{
+			dad: make(map[tcpip.Address]dadState),
+		},
+	}
+	nic.ndp.nic = nic
+
+	// Register supported packet endpoint protocols.
+	for _, netProto := range header.Ethertypes {
+		nic.packetEPs[netProto] = []PacketEndpoint{}
 	}
+	for _, netProto := range stack.networkProtocols {
+		nic.packetEPs[netProto.Number()] = []PacketEndpoint{}
+	}
+
+	return nic
 }
 
 // enable enables the NIC. enable will attach the link to its LinkEndpoint and
@@ -121,11 +143,50 @@ func (n *NIC) enable() *tcpip.Error {
 	// when we perform Duplicate Address Detection, or Router Advertisement
 	// when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861
 	// section 4.2 for more information.
-	if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok {
-		return n.joinGroup(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress)
+	//
+	// Also auto-generate an IPv6 link-local address based on the NIC's
+	// link address if it is configured to do so. Note, each interface is
+	// required to have IPv6 link-local unicast address, as per RFC 4291
+	// section 2.1.
+	_, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]
+	if !ok {
+		return nil
 	}
 
-	return nil
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
+	if !n.stack.autoGenIPv6LinkLocal {
+		return nil
+	}
+
+	l2addr := n.linkEP.LinkAddress()
+
+	// Only attempt to generate the link-local address if we have a
+	// valid MAC address.
+	//
+	// TODO(b/141011931): Validate a LinkEndpoint's link address
+	// (provided by LinkEndpoint.LinkAddress) before reaching this
+	// point.
+	if !header.IsValidUnicastEthernetAddress(l2addr) {
+		return nil
+	}
+
+	addr := header.LinkLocalAddr(l2addr)
+
+	_, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
+		Protocol: header.IPv6ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   addr,
+			PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+		},
+	}, CanBePrimaryEndpoint)
+
+	return err
 }
 
 // attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
@@ -161,18 +222,7 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 
-	list := n.primary[protocol]
-	if list == nil {
-		return nil
-	}
-
-	for e := list.Front(); e != nil; e = e.Next() {
-		r := e.(*referencedNetworkEndpoint)
-		// TODO(crawshaw): allow broadcast address when SO_BROADCAST is set.
-		switch r.ep.ID().LocalAddress {
-		case header.IPv4Broadcast, header.IPv4Any:
-			continue
-		}
+	for _, r := range n.primary[protocol] {
 		if r.isValidForOutgoing() && r.tryIncRef() {
 			return r
 		}
@@ -282,13 +332,35 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 	id := NetworkEndpointID{protocolAddress.AddressWithPrefix.Address}
 	if ref, ok := n.endpoints[id]; ok {
 		switch ref.getKind() {
-		case permanent:
+		case permanentTentative, permanent:
 			// The NIC already have a permanent endpoint with that address.
 			return nil, tcpip.ErrDuplicateAddress
 		case permanentExpired, temporary:
-			// Promote the endpoint to become permanent.
+			// Promote the endpoint to become permanent and respect
+			// the new peb.
 			if ref.tryIncRef() {
 				ref.setKind(permanent)
+
+				refs := n.primary[ref.protocol]
+				for i, r := range refs {
+					if r == ref {
+						switch peb {
+						case CanBePrimaryEndpoint:
+							return ref, nil
+						case FirstPrimaryEndpoint:
+							if i == 0 {
+								return ref, nil
+							}
+							n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+						case NeverPrimaryEndpoint:
+							n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+							return ref, nil
+						}
+					}
+				}
+
+				n.insertPrimaryEndpointLocked(ref, peb)
+
 				return ref, nil
 			}
 			// tryIncRef failing means the endpoint is scheduled to be removed once
@@ -298,6 +370,7 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 			n.removeEndpointLocked(ref)
 		}
 	}
+
 	return n.addAddressLocked(protocolAddress, peb, permanent)
 }
 
@@ -321,6 +394,15 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 	if err != nil {
 		return nil, err
 	}
+
+	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
+
+	// If the address is an IPv6 address and it is a permanent address,
+	// mark it as tentative so it goes through the DAD process.
+	if isIPv6Unicast && kind == permanent {
+		kind = permanentTentative
+	}
+
 	ref := &referencedNetworkEndpoint{
 		refs:     1,
 		ep:       ep,
@@ -338,7 +420,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	// If we are adding an IPv6 unicast address, join the solicited-node
 	// multicast address.
-	if protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address) {
+	if isIPv6Unicast {
 		snmc := header.SolicitedNodeAddr(protocolAddress.AddressWithPrefix.Address)
 		if err := n.joinGroupLocked(protocolAddress.Protocol, snmc); err != nil {
 			return nil, err
@@ -347,17 +429,13 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 
 	n.endpoints[id] = ref
 
-	l, ok := n.primary[protocolAddress.Protocol]
-	if !ok {
-		l = &ilist.List{}
-		n.primary[protocolAddress.Protocol] = l
-	}
+	n.insertPrimaryEndpointLocked(ref, peb)
 
-	switch peb {
-	case CanBePrimaryEndpoint:
-		l.PushBack(ref)
-	case FirstPrimaryEndpoint:
-		l.PushFront(ref)
+	// If we are adding a tentative IPv6 address, start DAD.
+	if isIPv6Unicast && kind == permanentTentative {
+		if err := n.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
+			return nil, err
+		}
 	}
 
 	return ref, nil
@@ -382,10 +460,12 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 
 	addrs := make([]tcpip.ProtocolAddress, 0, len(n.endpoints))
 	for nid, ref := range n.endpoints {
-		// Don't include expired or temporary endpoints to avoid confusion and
-		// prevent the caller from using those.
+		// Don't include tentative, expired or temporary endpoints to
+		// avoid confusion and prevent the caller from using those.
 		switch ref.getKind() {
-		case permanentExpired, temporary:
+		case permanentTentative, permanentExpired, temporary:
+			// TODO(b/140898488): Should tentative addresses be
+			//                    returned?
 			continue
 		}
 		addrs = append(addrs, tcpip.ProtocolAddress{
@@ -406,12 +486,12 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
 
 	var addrs []tcpip.ProtocolAddress
 	for proto, list := range n.primary {
-		for e := list.Front(); e != nil; e = e.Next() {
-			ref := e.(*referencedNetworkEndpoint)
-			// Don't include expired or tempory endpoints to avoid confusion and
-			// prevent the caller from using those.
+		for _, ref := range list {
+			// Don't include tentative, expired or temporary endpoints
+			// to avoid confusion and prevent the caller from using
+			// those.
 			switch ref.getKind() {
-			case permanentExpired, temporary:
+			case permanentTentative, permanentExpired, temporary:
 				continue
 			}
 
@@ -471,6 +551,19 @@ func (n *NIC) AddressRanges() []tcpip.Subnet {
 	return append(sns, n.addressRanges...)
 }
 
+// insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
+// by peb.
+//
+// n MUST be locked.
+func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb PrimaryEndpointBehavior) {
+	switch peb {
+	case CanBePrimaryEndpoint:
+		n.primary[r.protocol] = append(n.primary[r.protocol], r)
+	case FirstPrimaryEndpoint:
+		n.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.primary[r.protocol]...)
+	}
+}
+
 func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 	id := *r.ep.ID()
 
@@ -488,9 +581,12 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 	}
 
 	delete(n.endpoints, id)
-	wasInList := r.Next() != nil || r.Prev() != nil || r == n.primary[r.protocol].Front()
-	if wasInList {
-		n.primary[r.protocol].Remove(r)
+	refs := n.primary[r.protocol]
+	for i, ref := range refs {
+		if ref == r {
+			n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+			break
+		}
 	}
 
 	r.ep.Close()
@@ -504,10 +600,22 @@ func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
 
 func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 	r, ok := n.endpoints[NetworkEndpointID{addr}]
-	if !ok || r.getKind() != permanent {
+	if !ok {
 		return tcpip.ErrBadLocalAddress
 	}
 
+	kind := r.getKind()
+	if kind != permanent && kind != permanentTentative {
+		return tcpip.ErrBadLocalAddress
+	}
+
+	isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
+
+	// If we are removing a tentative IPv6 unicast address, stop DAD.
+	if isIPv6Unicast && kind == permanentTentative {
+		n.ndp.stopDuplicateAddressDetection(addr)
+	}
+
 	r.setKind(permanentExpired)
 	if !r.decRefLocked() {
 		// The endpoint still has references to it.
@@ -518,7 +626,7 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 
 	// If we are removing an IPv6 unicast address, leave the solicited-node
 	// multicast address.
-	if r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr) {
+	if isIPv6Unicast {
 		snmc := header.SolicitedNodeAddr(addr)
 		if err := n.leaveGroupLocked(snmc); err != nil {
 			return err
@@ -548,6 +656,11 @@ func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address
 // exists yet. Otherwise it just increments its count. n MUST be locked before
 // joinGroupLocked is called.
 func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	// TODO(b/143102137): When implementing MLD, make sure MLD packets are
+	// not sent unless a valid link-local address is available for use on n
+	// as an MLD packet's source address must be a link-local address as
+	// outlined in RFC 3810 section 5.
+
 	id := NetworkEndpointID{addr}
 	joins := n.mcastJoins[id]
 	if joins == 0 {
@@ -611,7 +724,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
 	n.stats.Rx.Packets.Increment()
 	n.stats.Rx.Bytes.IncrementBy(uint64(vv.Size()))
 
@@ -621,6 +734,26 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr
 		return
 	}
 
+	// If no local link layer address is provided, assume it was sent
+	// directly to this NIC.
+	if local == "" {
+		local = n.linkEP.LinkAddress()
+	}
+
+	// Are any packet sockets listening for this network protocol?
+	n.mu.RLock()
+	packetEPs := n.packetEPs[protocol]
+	// Check whether there are packet sockets listening for every protocol.
+	// If we received a packet with protocol EthernetProtocolAll, then the
+	// lookup above has already collected those endpoints.
+	if protocol != header.EthernetProtocolAll {
+		packetEPs = append(packetEPs, n.packetEPs[header.EthernetProtocolAll]...)
+	}
+	n.mu.RUnlock()
+	for _, ep := range packetEPs {
+		ep.HandlePacket(n.id, local, protocol, vv, linkHeader)
+	}
+
 	if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
@@ -632,8 +765,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr
 
 	src, dst := netProto.ParseAddresses(vv.First())
 
-	n.stack.AddLinkAddress(n.id, src, remote)
-
 	if ref := n.getRef(protocol, dst); ref != nil {
 		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, vv)
 		return
@@ -682,7 +813,10 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr
 		return
 	}
 
-	n.stack.stats.IP.InvalidAddressesReceived.Increment()
+	// If a packet socket handled the packet, don't treat it as invalid.
+	if len(packetEPs) == 0 {
+		n.stack.stats.IP.InvalidAddressesReceived.Increment()
+	}
 }
 
 // DeliverTransportPacket delivers the packets to the appropriate transport
@@ -769,14 +903,58 @@ func (n *NIC) Stack() *Stack {
 	return n.stack
 }
 
+// isAddrTentative returns true if addr is tentative on n.
+//
+// Note that if addr is not associated with n, then this function will return
+// false. It will only return true if the address is associated with the NIC
+// AND it is tentative.
+func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
+	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+	if !ok {
+		return false
+	}
+
+	return ref.getKind() == permanentTentative
+}
+
+// dupTentativeAddrDetected attempts to inform n that a tentative addr
+// is a duplicate on a link.
+//
+// dupTentativeAddrDetected will delete the tentative address if it exists.
+func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+	if !ok {
+		return tcpip.ErrBadAddress
+	}
+
+	if ref.getKind() != permanentTentative {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	return n.removePermanentAddressLocked(addr)
+}
+
 type networkEndpointKind int32
 
 const (
+	// A permanentTentative endpoint is a permanent address that is not yet
+	// considered to be fully bound to an interface in the traditional
+	// sense. That is, the address is associated with a NIC, but packets
+	// destined to the address MUST NOT be accepted and MUST be silently
+	// dropped, and the address MUST NOT be used as a source address for
+	// outgoing packets. For IPv6, addresses will be of this kind until
+	// NDP's Duplicate Address Detection has resolved, or be deleted if
+	// the process results in detecting a duplicate address.
+	permanentTentative networkEndpointKind = iota
+
 	// A permanent endpoint is created by adding a permanent address (vs. a
 	// temporary one) to the NIC. Its reference count is biased by 1 to avoid
 	// removal when no route holds a reference to it. It is removed by explicitly
 	// removing the permanent address from the NIC.
-	permanent networkEndpointKind = iota
+	permanent
 
 	// An expired permanent endoint is a permanent endoint that had its address
 	// removed from the NIC, and it is waiting to be removed once no more routes
@@ -794,8 +972,37 @@ const (
 	temporary
 )
 
+func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	eps, ok := n.packetEPs[netProto]
+	if !ok {
+		return tcpip.ErrNotSupported
+	}
+	n.packetEPs[netProto] = append(eps, ep)
+
+	return nil
+}
+
+func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	eps, ok := n.packetEPs[netProto]
+	if !ok {
+		return
+	}
+
+	for i, epOther := range eps {
+		if epOther == ep {
+			n.packetEPs[netProto] = append(eps[:i], eps[i+1:]...)
+			return
+		}
+	}
+}
+
 type referencedNetworkEndpoint struct {
-	ilist.Entry
 	ep       NetworkEndpoint
 	nic      *NIC
 	protocol tcpip.NetworkProtocolNumber
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 9d6157f22..0869fb084 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -71,8 +71,8 @@ type TransportEndpoint interface {
 
 // RawTransportEndpoint is the interface that needs to be implemented by raw
 // transport protocol endpoints. RawTransportEndpoints receive the entire
-// packet - including the link, network, and transport headers - as delivered
-// to netstack.
+// packet - including the network and transport headers - as delivered to
+// netstack.
 type RawTransportEndpoint interface {
 	// HandlePacket is called by the stack when new packets arrive to
 	// this transport endpoint. The packet contains all data from the link
@@ -80,6 +80,22 @@ type RawTransportEndpoint interface {
 	HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView)
 }
 
+// PacketEndpoint is the interface that needs to be implemented by packet
+// transport protocol endpoints. These endpoints receive link layer headers in
+// addition to whatever they contain (usually network and transport layer
+// headers and a payload).
+type PacketEndpoint interface {
+	// HandlePacket is called by the stack when new packets arrive that
+	// match the endpoint.
+	//
+	// Implementers should treat packet as immutable and should copy it
+	// before modification.
+	//
+	// linkHeader may have a length of 0, in which case the PacketEndpoint
+	// should construct its own ethernet header for applications.
+	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, packet buffer.VectorisedView, linkHeader buffer.View)
+}
+
 // TransportProtocol is the interface that needs to be implemented by transport
 // protocols (e.g., tcp, udp) that want to be part of the networking stack.
 type TransportProtocol interface {
@@ -185,6 +201,10 @@ type NetworkEndpoint interface {
 	// protocol.
 	WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) *tcpip.Error
 
+	// WritePackets writes packets to the given destination address and
+	// protocol.
+	WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error)
+
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
 	WriteHeaderIncludedPacket(r *Route, payload buffer.VectorisedView, loop PacketLooping) *tcpip.Error
@@ -242,9 +262,10 @@ type NetworkProtocol interface {
 // packets to the appropriate network endpoint after it has been handled by
 // the data link layer.
 type NetworkDispatcher interface {
-	// DeliverNetworkPacket finds the appropriate network protocol
-	// endpoint and hands the packet over for further processing.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+	// DeliverNetworkPacket finds the appropriate network protocol endpoint
+	// and hands the packet over for further processing. linkHeader may have
+	// length 0 when the caller does not have ethernet data.
+	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -266,7 +287,11 @@ const (
 	CapabilitySaveRestore
 	CapabilityDisconnectOk
 	CapabilityLoopback
-	CapabilityGSO
+	CapabilityHardwareGSO
+
+	// CapabilitySoftwareGSO indicates the link endpoint supports sending
+	// multiple packets using a single call (LinkEndpoint.WritePackets).
+	CapabilitySoftwareGSO
 )
 
 // LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
@@ -301,6 +326,18 @@ type LinkEndpoint interface {
 	// r.LocalLinkAddress if it is provided.
 	WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error
 
+	// WritePackets writes packets with the given protocol through the
+	// given route.
+	//
+	// Right now, WritePackets is used only when software segmentation
+	// offload is enabled. If it is ever used for anything else, the
+	// syscall filters may need to be changed.
+	WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+
+	// WriteRawPacket writes a packet directly to the link. The packet
+	// should already have an ethernet header.
+	WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error
+
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
 	Attach(dispatcher NetworkDispatcher)
@@ -324,13 +361,14 @@ type LinkEndpoint interface {
 type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
-	// Inject injects an inbound packet.
-	Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+	// InjectInbound injects an inbound packet.
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
 
-	// WriteRawPacket writes a fully formed outbound packet directly to the link.
+	// InjectOutbound writes a fully formed outbound packet directly to the
+	// link.
 	//
 	// dest is used by endpoints with multiple raw destinations.
-	WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error
+	InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error
 }
 
 // A LinkAddressResolver is an extension to a NetworkProtocol that
@@ -379,11 +417,16 @@ type LinkAddressCache interface {
 	RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker)
 }
 
-// UnassociatedEndpointFactory produces endpoints for writing packets not
-// associated with a particular transport protocol. Such endpoints can be used
-// to write arbitrary packets that include the IP header.
-type UnassociatedEndpointFactory interface {
-	NewUnassociatedRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+// RawFactory produces endpoints for writing various types of raw packets.
+type RawFactory interface {
+	// NewUnassociatedEndpoint produces endpoints for writing packets not
+	// associated with a particular transport protocol. Such endpoints can
+	// be used to write arbitrary packets that include the network header.
+	NewUnassociatedEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+	// NewPacketEndpoint produces endpoints for reading and writing packets
+	// that include network and (when cooked is false) link layer headers.
+	NewPacketEndpoint(stack *Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
 }
 
 // GSOType is the type of GSO segments.
@@ -394,8 +437,14 @@ type GSOType int
 // Types of gso segments.
 const (
 	GSONone GSOType = iota
+
+	// Hardware GSO types:
 	GSOTCPv4
 	GSOTCPv6
+
+	// GSOSW is used for software GSO segments which have to be sent by
+	// endpoint.WritePackets.
+	GSOSW
 )
 
 // GSO contains generic segmentation offload properties.
@@ -423,3 +472,7 @@ type GSOEndpoint interface {
 	// GSOMaxSize returns the maximum GSO packet size.
 	GSOMaxSize() uint32
 }
+
+// SoftwareGSOMaxSize is the maximum allowed size of a software GSO segment.
+// This isn't a hard limit, because it is never set into packet headers.
+const SoftwareGSOMaxSize = (1 << 16)
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index e72373964..1a0a51b57 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -47,8 +47,8 @@ type Route struct {
 	// starts.
 	ref *referencedNetworkEndpoint
 
-	// loop controls where WritePacket should send packets.
-	loop PacketLooping
+	// Loop controls where WritePacket should send packets.
+	Loop PacketLooping
 }
 
 // makeRoute initializes a new route. It takes ownership of the provided
@@ -69,7 +69,7 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip
 		LocalLinkAddress: localLinkAddr,
 		RemoteAddress:    remoteAddr,
 		ref:              ref,
-		loop:             loop,
+		Loop:             loop,
 	}
 }
 
@@ -159,7 +159,7 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.loop)
+	err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.Loop)
 	if err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 	} else {
@@ -169,6 +169,44 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec
 	return err
 }
 
+// PacketDescriptor is a packet descriptor which contains a packet header and
+// offset and size of packet data in a payload view.
+type PacketDescriptor struct {
+	Hdr  buffer.Prependable
+	Off  int
+	Size int
+}
+
+// NewPacketDescriptors allocates a set of packet descriptors.
+func NewPacketDescriptors(n int, hdrSize int) []PacketDescriptor {
+	buf := make([]byte, n*hdrSize)
+	hdrs := make([]PacketDescriptor, n)
+	for i := range hdrs {
+		hdrs[i].Hdr = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
+	}
+	return hdrs
+}
+
+// WritePackets writes the set of packets through the given route.
+func (r *Route) WritePackets(gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams) (int, *tcpip.Error) {
+	if !r.ref.isValidForOutgoing() {
+		return 0, tcpip.ErrInvalidEndpointState
+	}
+
+	n, err := r.ref.ep.WritePackets(r, gso, hdrs, payload, params, r.Loop)
+	if err != nil {
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(hdrs) - n))
+	}
+	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
+	payloadSize := 0
+	for i := 0; i < n; i++ {
+		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(hdrs[i].Hdr.UsedLength()))
+		payloadSize += hdrs[i].Size
+	}
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize))
+	return n, err
+}
+
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
 func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.Error {
@@ -176,7 +214,7 @@ func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.loop); err != nil {
+	if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.Loop); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return err
 	}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f67975525..5ea432a24 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -351,10 +351,9 @@ type Stack struct {
 	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol
 	linkAddrResolvers  map[tcpip.NetworkProtocolNumber]LinkAddressResolver
 
-	// unassociatedFactory creates unassociated endpoints. If nil, raw
-	// endpoints are disabled. It is set during Stack creation and is
-	// immutable.
-	unassociatedFactory UnassociatedEndpointFactory
+	// rawFactory creates raw endpoints. If nil, raw endpoints are
+	// disabled. It is set during Stack creation and is immutable.
+	rawFactory RawFactory
 
 	demux *transportDemuxer
 
@@ -399,6 +398,18 @@ type Stack struct {
 	//
 	// TODO(gvisor.dev/issue/940): S/R this field.
 	portSeed uint32
+
+	// ndpConfigs is the NDP configurations used by interfaces.
+	ndpConfigs NDPConfigurations
+
+	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
+	// to auto-generate an IPv6 link-local address for newly enabled NICs.
+	// See the AutoGenIPv6LinkLocal field of Options for more details.
+	autoGenIPv6LinkLocal bool
+
+	// ndpDisp is the NDP event dispatcher that is used to send the netstack
+	// integrator NDP related events.
+	ndpDisp NDPDispatcher
 }
 
 // Options contains optional Stack configuration.
@@ -422,9 +433,32 @@ type Options struct {
 	// stack (false).
 	HandleLocal bool
 
-	// UnassociatedFactory produces unassociated endpoints raw endpoints.
-	// Raw endpoints are enabled only if this is non-nil.
-	UnassociatedFactory UnassociatedEndpointFactory
+	// NDPConfigs is the NDP configurations used by interfaces.
+	//
+	// By default, NDPConfigs will have a zero value for its
+	// DupAddrDetectTransmits field, implying that DAD will not be performed
+	// before assigning an address to a NIC.
+	NDPConfigs NDPConfigurations
+
+	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt
+	// to auto-generate an IPv6 link-local address for newly enabled NICs.
+	// Note, setting this to true does not mean that a link-local address
+	// will be assigned right away, or at all. If Duplicate Address
+	// Detection is enabled, an address will only be assigned if it
+	// successfully resolves. If it fails, no further attempt will be made
+	// to auto-generate an IPv6 link-local address.
+	//
+	// The generated link-local address will follow RFC 4291 Appendix A
+	// guidelines.
+	AutoGenIPv6LinkLocal bool
+
+	// NDPDisp is the NDP event dispatcher that an integrator can provide to
+	// receive NDP related events.
+	NDPDisp NDPDispatcher
+
+	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
+	// this is non-nil.
+	RawFactory RawFactory
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -458,6 +492,9 @@ func (*TransportEndpointInfo) IsEndpointInfo() {}
 // New allocates a new networking stack with only the requested networking and
 // transport protocols configured with default options.
 //
+// Note, NDPConfigurations will be fixed before being used by the Stack. That
+// is, if an invalid value was provided, it will be reset to the default value.
+//
 // Protocol options can be changed by calling the
 // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the
 // stack. Please refer to individual protocol implementations as to what options
@@ -468,18 +505,24 @@ func New(opts Options) *Stack {
 		clock = &tcpip.StdClock{}
 	}
 
+	// Make sure opts.NDPConfigs contains valid values only.
+	opts.NDPConfigs.validate()
+
 	s := &Stack{
-		transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
-		networkProtocols:   make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
-		linkAddrResolvers:  make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
-		nics:               make(map[tcpip.NICID]*NIC),
-		linkAddrCache:      newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
-		PortManager:        ports.NewPortManager(),
-		clock:              clock,
-		stats:              opts.Stats.FillIn(),
-		handleLocal:        opts.HandleLocal,
-		icmpRateLimiter:    NewICMPRateLimiter(),
-		portSeed:           generateRandUint32(),
+		transportProtocols:   make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
+		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
+		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
+		nics:                 make(map[tcpip.NICID]*NIC),
+		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
+		PortManager:          ports.NewPortManager(),
+		clock:                clock,
+		stats:                opts.Stats.FillIn(),
+		handleLocal:          opts.HandleLocal,
+		icmpRateLimiter:      NewICMPRateLimiter(),
+		portSeed:             generateRandUint32(),
+		ndpConfigs:           opts.NDPConfigs,
+		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+		ndpDisp:              opts.NDPDisp,
 	}
 
 	// Add specified network protocols.
@@ -497,8 +540,8 @@ func New(opts Options) *Stack {
 		}
 	}
 
-	// Add the factory for unassociated endpoints, if present.
-	s.unassociatedFactory = opts.UnassociatedFactory
+	// Add the factory for raw endpoints, if present.
+	s.rawFactory = opts.RawFactory
 
 	// Create the global transport demuxer.
 	s.demux = newTransportDemuxer(s)
@@ -633,12 +676,12 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp
 // protocol. Raw endpoints receive all traffic for a given protocol regardless
 // of address.
 func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) {
-	if s.unassociatedFactory == nil {
+	if s.rawFactory == nil {
 		return nil, tcpip.ErrNotPermitted
 	}
 
 	if !associated {
-		return s.unassociatedFactory.NewUnassociatedRawEndpoint(s, network, transport, waiterQueue)
+		return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue)
 	}
 
 	t, ok := s.transportProtocols[transport]
@@ -649,6 +692,16 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network
 	return t.proto.NewRawEndpoint(s, network, waiterQueue)
 }
 
+// NewPacketEndpoint creates a new packet endpoint listening for the given
+// netProto.
+func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	if s.rawFactory == nil {
+		return nil, tcpip.ErrNotPermitted
+	}
+
+	return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
+}
+
 // createNIC creates a NIC with the provided id and link-layer endpoint, and
 // optionally enable it.
 func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error {
@@ -1118,6 +1171,109 @@ func (s *Stack) Resume() {
 	}
 }
 
+// RegisterPacketEndpoint registers ep with the stack, causing it to receive
+// all traffic of the specified netProto on the given NIC. If nicID is 0, it
+// receives traffic from every NIC.
+func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// If no NIC is specified, capture on all devices.
+	if nicID == 0 {
+		// Register with each NIC.
+		for _, nic := range s.nics {
+			if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
+				s.unregisterPacketEndpointLocked(0, netProto, ep)
+				return err
+			}
+		}
+		return nil
+	}
+
+	// Capture on a specific device.
+	nic, ok := s.nics[nicID]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+	if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// UnregisterPacketEndpoint unregisters ep for packets of the specified
+// netProto from the specified NIC. If nicID is 0, ep is unregistered from all
+// NICs.
+func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.unregisterPacketEndpointLocked(nicID, netProto, ep)
+}
+
+func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+	// If no NIC is specified, unregister on all devices.
+	if nicID == 0 {
+		// Unregister with each NIC.
+		for _, nic := range s.nics {
+			nic.unregisterPacketEndpoint(netProto, ep)
+		}
+		return
+	}
+
+	// Unregister in a single device.
+	nic, ok := s.nics[nicID]
+	if !ok {
+		return
+	}
+	nic.unregisterPacketEndpoint(netProto, ep)
+}
+
+// WritePacket writes payload directly to the NIC with ID nicid, prepending an
+// ethernet header built from the NIC's link address, dst and netProto.
+func (s *Stack) WritePacket(nicid tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicid]
+	s.mu.RUnlock()
+	if !ok {
+		return tcpip.ErrUnknownDevice
+	}
+
+	// Add our own fake ethernet header; the source is the NIC's link address.
+	ethFields := header.EthernetFields{
+		SrcAddr: nic.linkEP.LinkAddress(),
+		DstAddr: dst,
+		Type:    netProto,
+	}
+	fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
+	fakeHeader.Encode(&ethFields)
+	ethHeader := buffer.View(fakeHeader).ToVectorisedView()
+	ethHeader.Append(payload)
+
+	if err := nic.linkEP.WriteRawPacket(ethHeader); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// WriteRawPacket writes payload directly to the NIC with ID nicid without
+// adding any headers. It returns tcpip.ErrUnknownDevice if no such NIC exists.
+func (s *Stack) WriteRawPacket(nicid tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicid]
+	s.mu.RUnlock()
+	if !ok {
+		return tcpip.ErrUnknownDevice
+	}
+
+	if err := nic.linkEP.WriteRawPacket(payload); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // NetworkProtocolInstance returns the protocol instance in the stack for the
 // specified network protocol. This method is public for protocol implementers
 // and tests to use.
@@ -1238,6 +1394,37 @@ func (s *Stack) AllowICMPMessage() bool {
 	return s.icmpRateLimiter.Allow()
 }
 
+// IsAddrTentative returns true if addr is tentative on the NIC with ID id.
+//
+// Note that if addr is not associated with a NIC with id ID, then this
+// function will return false. It will only return true if the address is
+// associated with the NIC AND it is tentative.
+func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return false, tcpip.ErrUnknownNICID
+	}
+
+	return nic.isAddrTentative(addr), nil
+}
+
+// DupTentativeAddrDetected attempts to inform the NIC with ID id that a
+// tentative addr on it is a duplicate on a link.
+func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.dupTentativeAddrDetected(addr)
+}
+
 // PortSeed returns a 32 bit value that can be used as a seed value for port
 // picking.
 //
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 10fd1065f..9dae853d0 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -24,11 +24,14 @@ import (
 	"sort"
 	"strings"
 	"testing"
+	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
@@ -144,6 +147,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu
 	return f.ep.WritePacket(r, gso, hdr, payload, fakeNetNumber)
 }
 
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
 func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
@@ -1864,3 +1872,297 @@ func TestNICForwarding(t *testing.T) {
 		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
 	}
 }
+
+// TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses
+// (or lack thereof if disabled (default)). Note, DAD will be disabled in
+// these tests.
+func TestNICAutoGenAddr(t *testing.T) {
+	tests := []struct {
+		name      string
+		autoGen   bool
+		linkAddr  tcpip.LinkAddress
+		shouldGen bool
+	}{
+		{
+			"Disabled",
+			false,
+			linkAddr1,
+			false,
+		},
+		{
+			"Enabled",
+			true,
+			linkAddr1,
+			true,
+		},
+		{
+			"Nil MAC",
+			true,
+			tcpip.LinkAddress([]byte(nil)),
+			false,
+		},
+		{
+			"Empty MAC",
+			true,
+			tcpip.LinkAddress(""),
+			false,
+		},
+		{
+			"Invalid MAC",
+			true,
+			tcpip.LinkAddress("\x01\x02\x03"),
+			false,
+		},
+		{
+			"Multicast MAC",
+			true,
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			false,
+		},
+		{
+			"Unspecified MAC",
+			true,
+			tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+			false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+			}
+
+			if test.autoGen {
+				// Only set opts.AutoGenIPv6LinkLocal when
+				// test.autoGen is true because
+				// opts.AutoGenIPv6LinkLocal should be false by
+				// default.
+				opts.AutoGenIPv6LinkLocal = true
+			}
+
+			e := channel.New(10, 1280, test.linkAddr)
+			s := stack.New(opts)
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+
+			if test.shouldGen {
+				// Should have auto-generated an address and
+				// resolved immediately (DAD is disabled).
+				if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+					t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+				}
+			} else {
+				// Should not have auto-generated an address.
+				if want := (tcpip.AddressWithPrefix{}); addr != want {
+					t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+				}
+			}
+		})
+	}
+}
+
+// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
+// link-local addresses will only be assigned after the DAD process resolves.
+func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.DefaultNDPConfigurations()
+	opts := stack.Options{
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:           ndpConfigs,
+		AutoGenIPv6LinkLocal: true,
+		NDPDisp:              &ndpDisp,
+	}
+
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(_) = %s", err)
+	}
+
+	// Address should not be considered bound to the
+	// NIC yet (DAD ongoing).
+	addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+	}
+
+	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
+
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// We should get a resolution event after 1s (default time to
+		// resolve as per default NDP configurations). Waiting for that
+		// resolution time + an extra 1s without a resolution event
+		// means something is wrong.
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if e.err != nil {
+			t.Fatal("got DAD error: ", e.err)
+		}
+		if e.nicid != 1 {
+			t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+		}
+		if e.addr != linkLocalAddr {
+			t.Fatalf("got DAD event w/ addr = %s, want = %s", e.addr, linkLocalAddr)
+		}
+		if !e.resolved {
+			t.Fatal("got DAD event w/ resolved = false, want = true")
+		}
+	}
+	addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+	}
+	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+	}
+}
+
+// TestNewPEBOnPromotionToPermanent tests that a new PrimaryEndpointBehavior
+// (peb) is respected when a permanentExpired address is "promoted" to permanent.
+func TestNewPEBOnPromotionToPermanent(t *testing.T) {
+	pebs := []stack.PrimaryEndpointBehavior{
+		stack.NeverPrimaryEndpoint,
+		stack.CanBePrimaryEndpoint,
+		stack.FirstPrimaryEndpoint,
+	}
+
+	for _, pi := range pebs {
+		for _, ps := range pebs {
+			t.Run(fmt.Sprintf("%d-to-%d", pi, ps), func(t *testing.T) {
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+				})
+				ep1 := channel.New(10, defaultMTU, "")
+				if err := s.CreateNIC(1, ep1); err != nil {
+					t.Fatal("CreateNIC failed:", err)
+				}
+
+				// Add a permanent address with initial
+				// PrimaryEndpointBehavior (peb), pi. If pi is
+				// NeverPrimaryEndpoint, the address should not
+				// be returned by a call to GetMainNICAddress;
+				// else, it should.
+				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", pi); err != nil {
+					t.Fatal("AddAddressWithOptions failed:", err)
+				}
+				addr, err := s.GetMainNICAddress(1, fakeNetNumber)
+				if err != nil {
+					t.Fatal("s.GetMainNICAddress failed:", err)
+				}
+				if pi == stack.NeverPrimaryEndpoint {
+					if want := (tcpip.AddressWithPrefix{}); addr != want {
+						t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want)
+
+					}
+				} else if addr.Address != "\x01" {
+					t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address)
+				}
+
+				{
+					subnet, err := tcpip.NewSubnet("\x00", "\x00")
+					if err != nil {
+						t.Fatal("NewSubnet failed:", err)
+					}
+					s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+				}
+
+				// Take a route through the address so its ref
+				// count gets incremented and does not actually
+				// get deleted when RemoveAddress is called
+				// below. This is because we want to test that a
+				// new peb is respected when an address gets
+				// "promoted" to permanent from a
+				// permanentExpired kind.
+				r, err := s.FindRoute(1, "\x01", "\x02", fakeNetNumber, false)
+				if err != nil {
+					t.Fatal("FindRoute failed:", err)
+				}
+				defer r.Release()
+				if err := s.RemoveAddress(1, "\x01"); err != nil {
+					t.Fatal("RemoveAddress failed:", err)
+				}
+
+				//
+				// At this point, the address should still be
+				// known by the NIC, but have its
+				// kind = permanentExpired.
+				//
+
+				// Add some other address with peb set to
+				// FirstPrimaryEndpoint.
+				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x03", stack.FirstPrimaryEndpoint); err != nil {
+					t.Fatal("AddAddressWithOptions failed:", err)
+
+				}
+
+				// Add back the address we removed earlier and
+				// make sure the new peb was respected.
+				// (The address should just be promoted now).
+				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", ps); err != nil {
+					t.Fatal("AddAddressWithOptions failed:", err)
+				}
+				var primaryAddrs []tcpip.Address
+				for _, pa := range s.NICInfo()[1].ProtocolAddresses {
+					primaryAddrs = append(primaryAddrs, pa.AddressWithPrefix.Address)
+				}
+				var expectedList []tcpip.Address
+				switch ps {
+				case stack.FirstPrimaryEndpoint:
+					expectedList = []tcpip.Address{
+						"\x01",
+						"\x03",
+					}
+				case stack.CanBePrimaryEndpoint:
+					expectedList = []tcpip.Address{
+						"\x03",
+						"\x01",
+					}
+				case stack.NeverPrimaryEndpoint:
+					expectedList = []tcpip.Address{
+						"\x03",
+					}
+				}
+				if !cmp.Equal(primaryAddrs, expectedList) {
+					t.Fatalf("got NIC's primary addresses = %v, want = %v", primaryAddrs, expectedList)
+				}
+
+				// Once we remove the other address, if the new
+				// peb, ps, was NeverPrimaryEndpoint, no address
+				// should be returned by a call to
+				// GetMainNICAddress; else, our original address
+				// should be returned.
+				if err := s.RemoveAddress(1, "\x03"); err != nil {
+					t.Fatal("RemoveAddress failed:", err)
+				}
+				addr, err = s.GetMainNICAddress(1, fakeNetNumber)
+				if err != nil {
+					t.Fatal("s.GetMainNICAddress failed:", err)
+				}
+				if ps == stack.NeverPrimaryEndpoint {
+					if want := (tcpip.AddressWithPrefix{}); addr != want {
+						t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want)
+
+					}
+				} else {
+					if addr.Address != "\x01" {
+						t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address)
+					}
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 92267ce4d..97a1aec4b 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -465,7 +465,7 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer
 func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
 	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
 	if !ok {
-		return nil
+		return tcpip.ErrNotSupported
 	}
 
 	eps.mu.Lock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 26f338d8d..353ecd49b 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -255,7 +255,7 @@ type FullAddress struct {
 	// This may not be used by all endpoint types.
 	NIC NICID
 
-	// Addr is the network address.
+	// Addr is the network or link layer address.
 	Addr Address
 
 	// Port is the transport port.
@@ -1100,15 +1100,12 @@ func (*TransportEndpointStats) IsEndpointStats() {}
 func fillIn(v reflect.Value) {
 	for i := 0; i < v.NumField(); i++ {
 		v := v.Field(i)
-		switch v.Kind() {
-		case reflect.Ptr:
-			if s := v.Addr().Interface().(**StatCounter); *s == nil {
-				*s = &StatCounter{}
+		if s, ok := v.Addr().Interface().(**StatCounter); ok {
+			if *s == nil {
+				*s = new(StatCounter)
 			}
-		case reflect.Struct:
+		} else {
 			fillIn(v)
-		default:
-			panic(fmt.Sprintf("unexpected type %s", v.Type()))
 		}
 	}
 }
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
new file mode 100644
index 000000000..8ea2e6ee5
--- /dev/null
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -0,0 +1,46 @@
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "packet_list",
+    out = "packet_list.go",
+    package = "packet",
+    prefix = "packet",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*packet",
+        "Linker": "*packet",
+    },
+)
+
+go_library(
+    name = "packet",
+    srcs = [
+        "endpoint.go",
+        "endpoint_state.go",
+        "packet_list.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet",
+    imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/sleep",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/iptables",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
+)
+
+filegroup(
+    name = "autogen",
+    srcs = [
+        "packet_list.go",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
new file mode 100644
index 000000000..73cdaa265
--- /dev/null
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -0,0 +1,363 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package packet provides the implementation of packet sockets (see
+// packet(7)). Packet sockets allow applications to:
+//
+//   * manually write and inspect link, network, and transport headers
+//   * receive all traffic of a given network protocol, or all protocols
+//
+// Packet sockets are similar to raw sockets, but provide even more power to
+// users, letting them effectively talk directly to the network device.
+//
+// Packet sockets skip the input and output iptables chains.
+package packet
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// +stateify savable
+type packet struct {
+	packetEntry
+	// data holds the actual packet data, including any headers and
+	// payload.
+	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
+	// views is pre-allocated space to back data. As long as the packet is
+	// made up of fewer than 8 buffer.Views, no extra allocation is
+	// necessary to store packet data.
+	views [8]buffer.View `state:"nosave"`
+	// timestampNS is the unix time at which the packet was received.
+	timestampNS int64
+	// senderAddr is the link-layer address of the sender.
+	senderAddr tcpip.FullAddress
+}
+
+// endpoint is the packet socket implementation of tcpip.Endpoint. It is legal
+// to have goroutines make concurrent calls into the endpoint.
+//
+// Lock order:
+//   endpoint.mu
+//     endpoint.rcvMu
+//
+// +stateify savable
+type endpoint struct {
+	stack.TransportEndpointInfo
+	// The following fields are initialized at creation time and are
+	// immutable.
+	stack       *stack.Stack `state:"manual"`
+	netProto    tcpip.NetworkProtocolNumber
+	waiterQueue *waiter.Queue
+	cooked      bool
+
+	// The following fields are used to manage the receive queue and are
+	// protected by rcvMu.
+	rcvMu         sync.Mutex `state:"nosave"`
+	rcvList       packetList
+	rcvBufSizeMax int `state:".(int)"`
+	rcvBufSize    int
+	rcvClosed     bool
+
+	// The following fields are protected by mu.
+	mu         sync.RWMutex `state:"nosave"`
+	sndBufSize int
+	closed     bool
+	stats      tcpip.TransportEndpointStats `state:"nosave"`
+}
+
+// NewEndpoint returns a new packet endpoint.
+func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	ep := &endpoint{
+		stack: s,
+		TransportEndpointInfo: stack.TransportEndpointInfo{
+			NetProto: netProto,
+		},
+		cooked:        cooked,
+		netProto:      netProto,
+		waiterQueue:   waiterQueue,
+		rcvBufSizeMax: 32 * 1024,
+		sndBufSize:    32 * 1024,
+	}
+
+	if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil {
+		return nil, err
+	}
+	return ep, nil
+}
+
+// Close implements tcpip.Endpoint.Close.
+func (ep *endpoint) Close() {
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.closed {
+		return
+	}
+
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	ep.rcvMu.Lock()
+	defer ep.rcvMu.Unlock()
+
+	// Clear the receive list.
+	ep.rcvClosed = true
+	ep.rcvBufSize = 0
+	for !ep.rcvList.Empty() {
+		ep.rcvList.Remove(ep.rcvList.Front())
+	}
+
+	ep.closed = true
+	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+}
+
+// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
+func (ep *endpoint) ModerateRecvBuf(copied int) {}
+
+// IPTables implements tcpip.Endpoint.IPTables.
+func (ep *endpoint) IPTables() (iptables.IPTables, error) {
+	return ep.stack.IPTables(), nil
+}
+
+// Read implements tcpip.Endpoint.Read.
+func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	ep.rcvMu.Lock()
+
+	// If there's no data to read, return that read would block or that the
+	// endpoint is closed.
+	if ep.rcvList.Empty() {
+		err := tcpip.ErrWouldBlock
+		if ep.rcvClosed {
+			ep.stats.ReadErrors.ReadClosed.Increment()
+			err = tcpip.ErrClosedForReceive
+		}
+		ep.rcvMu.Unlock()
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
+	packet := ep.rcvList.Front()
+	ep.rcvList.Remove(packet)
+	ep.rcvBufSize -= packet.data.Size()
+
+	ep.rcvMu.Unlock()
+
+	if addr != nil {
+		*addr = packet.senderAddr
+	}
+
+	return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil
+}
+
+func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	// TODO(b/129292371): Implement.
+	return 0, nil, tcpip.ErrInvalidOptionValue
+}
+
+// Peek implements tcpip.Endpoint.Peek.
+func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	return 0, tcpip.ControlMessages{}, nil
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be
+// disconnected, and this function always returns tcpip.ErrNotSupported.
+func (*endpoint) Disconnect() *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be
+// connected, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used
+// with Shutdown, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with
+// Listen, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Listen(backlog int) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with
+// Accept, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	return nil, nil, tcpip.ErrNotSupported
+}
+
+// Bind implements tcpip.Endpoint.Bind.
+func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
+	// TODO(gvisor.dev/issue/173): Add Bind support.
+
+	// "By default, all packets of the specified protocol type are passed
+	// to a packet socket.  To get packets only from a specific interface
+	// use bind(2) specifying an address in a struct sockaddr_ll to bind
+	// the packet socket  to  an interface.  Fields used for binding are
+	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
+	// - packet(7).
+
+	return tcpip.ErrNotSupported
+}
+
+// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
+func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, tcpip.ErrNotSupported
+}
+
+// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
+func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	// Even a connected socket doesn't return a remote address.
+	return tcpip.FullAddress{}, tcpip.ErrNotConnected
+}
+
+// Readiness implements tcpip.Endpoint.Readiness.
+func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// The endpoint is always writable.
+	result := waiter.EventOut & mask
+
+	// Determine whether the endpoint is readable.
+	if (mask & waiter.EventIn) != 0 {
+		ep.rcvMu.Lock()
+		if !ep.rcvList.Empty() || ep.rcvClosed {
+			result |= waiter.EventIn
+		}
+		ep.rcvMu.Unlock()
+	}
+
+	return result
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be
+// used with SetSockOpt, and this function always returns
+// tcpip.ErrNotSupported.
+func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+	return 0, tcpip.ErrNotSupported
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// HandlePacket implements stack.PacketEndpoint.HandlePacket.
+func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, ethHeader buffer.View) {
+	ep.rcvMu.Lock()
+
+	// Drop the packet if the endpoint is closed or our buffer is currently full.
+	if ep.rcvClosed {
+		ep.rcvMu.Unlock()
+		ep.stack.Stats().DroppedPackets.Increment()
+		ep.stats.ReceiveErrors.ClosedReceiver.Increment()
+		return
+	}
+
+	if ep.rcvBufSize >= ep.rcvBufSizeMax {
+		ep.rcvMu.Unlock()
+		ep.stack.Stats().DroppedPackets.Increment()
+		ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
+		return
+	}
+
+	wasEmpty := ep.rcvBufSize == 0
+
+	// Push new packet into receive list and increment the buffer size.
+	var packet packet
+	// TODO(b/129292371): Return network protocol.
+	if len(ethHeader) > 0 {
+		// Get info directly from the ethernet header.
+		hdr := header.Ethernet(ethHeader)
+		packet.senderAddr = tcpip.FullAddress{
+			NIC:  nicid,
+			Addr: tcpip.Address(hdr.SourceAddress()),
+		}
+	} else {
+		// Guess the would-be ethernet header.
+		packet.senderAddr = tcpip.FullAddress{
+			NIC:  nicid,
+			Addr: tcpip.Address(localAddr),
+		}
+	}
+
+	if ep.cooked {
+		// Cooked packets can simply be queued.
+		packet.data = vv.Clone(packet.views[:])
+	} else {
+		// Raw packets need their ethernet headers prepended before
+		// queueing.
+		if len(ethHeader) == 0 {
+			// We weren't provided with an actual ethernet header,
+			// so fake one.
+			ethFields := header.EthernetFields{
+				SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}),
+				DstAddr: localAddr,
+				Type:    netProto,
+			}
+			fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
+			fakeHeader.Encode(&ethFields)
+			ethHeader = buffer.View(fakeHeader)
+		}
+		combinedVV := buffer.View(ethHeader).ToVectorisedView()
+		combinedVV.Append(vv)
+		packet.data = combinedVV.Clone(packet.views[:])
+	}
+	packet.timestampNS = ep.stack.NowNanoseconds()
+
+	ep.rcvList.PushBack(&packet)
+	ep.rcvBufSize += packet.data.Size()
+
+	ep.rcvMu.Unlock()
+	ep.stats.PacketsReceived.Increment()
+	// Notify waiters that there's data to be read.
+	if wasEmpty {
+		ep.waiterQueue.Notify(waiter.EventIn)
+	}
+}
+
+// State implements socket.Socket.State.
+func (ep *endpoint) State() uint32 {
+	return 0
+}
+
+// Info returns a copy of the endpoint info.
+func (ep *endpoint) Info() tcpip.EndpointInfo {
+	ep.mu.RLock()
+	// Make a copy of the endpoint info.
+	ret := ep.TransportEndpointInfo
+	ep.mu.RUnlock()
+	return &ret
+}
+
+// Stats returns a pointer to the endpoint stats.
+func (ep *endpoint) Stats() tcpip.EndpointStats {
+	return &ep.stats
+}
diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go
new file mode 100644
index 000000000..9b88f17e4
--- /dev/null
+++ b/pkg/tcpip/transport/packet/endpoint_state.go
@@ -0,0 +1,72 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package packet
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// saveData saves packet.data field.
+func (p *packet) saveData() buffer.VectorisedView {
+	// We cannot save p.data directly as p.data.views may alias to p.views,
+	// which is not allowed by state framework (in-struct pointer).
+	return p.data.Clone(nil)
+}
+
+// loadData loads packet.data field.
+func (p *packet) loadData(data buffer.VectorisedView) {
+	// NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
+	// here because data.views is not guaranteed to be loaded by now. Plus,
+	// data.views will be allocated anyway so there really is little point
+	// of utilizing p.views for data.views.
+	p.data = data
+}
+
+// beforeSave is invoked by stateify.
+func (ep *endpoint) beforeSave() {
+	// Stop incoming packets from being handled (and mutating endpoint state).
+	// The lock will be released after saveRcvBufSizeMax(), which would have
+	// saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming
+	// packets.
+	ep.rcvMu.Lock()
+}
+
+// saveRcvBufSizeMax is invoked by stateify.
+func (ep *endpoint) saveRcvBufSizeMax() int {
+	max := ep.rcvBufSizeMax
+	// Make sure no new packets will be handled regardless of the lock.
+	ep.rcvBufSizeMax = 0
+	// Release the lock acquired in beforeSave() so regular endpoint closing
+	// logic can proceed after save.
+	ep.rcvMu.Unlock()
+	return max
+}
+
+// loadRcvBufSizeMax is invoked by stateify.
+func (ep *endpoint) loadRcvBufSizeMax(max int) {
+	ep.rcvBufSizeMax = max
+}
+
+// afterLoad is invoked by stateify.
+func (ep *endpoint) afterLoad() {
+	// StackFromEnv is a stack used specifically for save/restore.
+	ep.stack = stack.StackFromEnv
+
+	// TODO(gvisor.dev/issue/173): Once bind is supported, choose the right NIC.
+	if err := ep.stack.RegisterPacketEndpoint(0, ep.netProto, ep); err != nil {
+		panic(*err)
+	}
+}
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index fba598d51..4af49218c 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library")
 package(licenses = ["notice"])
 
 go_template_instance(
-    name = "packet_list",
-    out = "packet_list.go",
+    name = "raw_packet_list",
+    out = "raw_packet_list.go",
     package = "raw",
-    prefix = "packet",
+    prefix = "rawPacket",
     template = "//pkg/ilist:generic_list",
     types = {
-        "Element": "*packet",
-        "Linker": "*packet",
+        "Element": "*rawPacket",
+        "Linker": "*rawPacket",
     },
 )
 
@@ -20,8 +20,8 @@ go_library(
     srcs = [
         "endpoint.go",
         "endpoint_state.go",
-        "packet_list.go",
         "protocol.go",
+        "raw_packet_list.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
@@ -34,6 +34,7 @@ go_library(
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/packet",
         "//pkg/waiter",
     ],
 )
@@ -41,7 +42,7 @@ go_library(
 filegroup(
     name = "autogen",
     srcs = [
-        "packet_list.go",
+        "raw_packet_list.go",
     ],
     visibility = ["//:sandbox"],
 )
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index b4c660859..308f10d24 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -17,8 +17,7 @@
 //
 //   * manually write and inspect transport layer headers and payloads
 //   * receive all traffic of a given transport protocol (e.g. ICMP or UDP)
-//   * optionally write and inspect network layer and link layer headers for
-//     packets
+//   * optionally write and inspect network layer headers of packets
 //
 // Raw sockets don't have any notion of ports, and incoming packets are
 // demultiplexed solely by protocol number. Thus, a raw UDP endpoint will
@@ -38,8 +37,8 @@ import (
 )
 
 // +stateify savable
-type packet struct {
-	packetEntry
+type rawPacket struct {
+	rawPacketEntry
 	// data holds the actual packet data, including any headers and
 	// payload.
 	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
@@ -72,7 +71,7 @@ type endpoint struct {
 	// The following fields are used to manage the receive queue and are
 	// protected by rcvMu.
 	rcvMu         sync.Mutex `state:"nosave"`
-	rcvList       packetList
+	rcvList       rawPacketList
 	rcvBufSizeMax int `state:".(int)"`
 	rcvBufSize    int
 	rcvClosed     bool
@@ -90,7 +89,6 @@ type endpoint struct {
 }
 
 // NewEndpoint returns a raw  endpoint for the given protocols.
-// TODO(b/129292371): IP_HDRINCL and AF_PACKET.
 func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */)
 }
@@ -187,17 +185,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		return buffer.View{}, tcpip.ControlMessages{}, err
 	}
 
-	packet := e.rcvList.Front()
-	e.rcvList.Remove(packet)
-	e.rcvBufSize -= packet.data.Size()
+	pkt := e.rcvList.Front()
+	e.rcvList.Remove(pkt)
+	e.rcvBufSize -= pkt.data.Size()
 
 	e.rcvMu.Unlock()
 
 	if addr != nil {
-		*addr = packet.senderAddr
+		*addr = pkt.senderAddr
 	}
 
-	return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil
+	return pkt.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: pkt.timestampNS}, nil
 }
 
 // Write implements tcpip.Endpoint.Write.
@@ -602,7 +600,7 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	packet := &packet{
+	pkt := &rawPacket{
 		senderAddr: tcpip.FullAddress{
 			NIC:  route.NICID(),
 			Addr: route.RemoteAddress,
@@ -611,11 +609,11 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
 
 	combinedVV := netHeader.ToVectorisedView()
 	combinedVV.Append(vv)
-	packet.data = combinedVV.Clone(packet.views[:])
-	packet.timestampNS = e.stack.NowNanoseconds()
+	pkt.data = combinedVV.Clone(pkt.views[:])
+	pkt.timestampNS = e.stack.NowNanoseconds()
 
-	e.rcvList.PushBack(packet)
-	e.rcvBufSize += packet.data.Size()
+	e.rcvList.PushBack(pkt)
+	e.rcvBufSize += pkt.data.Size()
 
 	e.rcvMu.Unlock()
 	e.stats.PacketsReceived.Increment()
diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go
index a6c7cc43a..33bfb56cd 100644
--- a/pkg/tcpip/transport/raw/endpoint_state.go
+++ b/pkg/tcpip/transport/raw/endpoint_state.go
@@ -20,15 +20,15 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
-// saveData saves packet.data field.
-func (p *packet) saveData() buffer.VectorisedView {
+// saveData saves rawPacket.data field.
+func (p *rawPacket) saveData() buffer.VectorisedView {
 	// We cannot save p.data directly as p.data.views may alias to p.views,
 	// which is not allowed by state framework (in-struct pointer).
 	return p.data.Clone(nil)
 }
 
-// loadData loads packet.data field.
-func (p *packet) loadData(data buffer.VectorisedView) {
+// loadData loads rawPacket.data field.
+func (p *rawPacket) loadData(data buffer.VectorisedView) {
 	// NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
 	// here because data.views is not guaranteed to be loaded by now. Plus,
 	// data.views will be allocated anyway so there really is little point
@@ -86,7 +86,9 @@ func (ep *endpoint) Resume(s *stack.Stack) {
 		}
 	}
 
-	if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
-		panic(err)
+	if ep.associated {
+		if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
+			panic(err)
+		}
 	}
 }
diff --git a/pkg/tcpip/transport/raw/protocol.go b/pkg/tcpip/transport/raw/protocol.go
index a2512d666..f30aa2a4a 100644
--- a/pkg/tcpip/transport/raw/protocol.go
+++ b/pkg/tcpip/transport/raw/protocol.go
@@ -17,13 +17,19 @@ package raw
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/packet"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// EndpointFactory implements stack.UnassociatedEndpointFactory.
+// EndpointFactory implements stack.RawFactory.
 type EndpointFactory struct{}
 
-// NewUnassociatedRawEndpoint implements stack.UnassociatedEndpointFactory.
-func (EndpointFactory) NewUnassociatedRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+// NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint.
+func (EndpointFactory) NewUnassociatedEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */)
 }
+
+// NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint.
+func (EndpointFactory) NewPacketEndpoint(stack *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return packet.NewEndpoint(stack, cooked, netProto, waiterQueue)
+}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index aed70e06f..f1dbc6f91 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -44,6 +44,7 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index b724d02bb..8db1cc028 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -607,17 +607,11 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
 	return nil
 }
 
-// sendTCP sends a TCP segment with the provided options via the provided
-// network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
+func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDescriptor, data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
 	optLen := len(opts)
-	// Allocate a buffer for the TCP header.
-	hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen)
-
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
-	}
-
+	hdr := &d.Hdr
+	packetSize := d.Size
+	off := d.Off
 	// Initialize the header.
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
 	tcp.Encode(&header.TCPFields{
@@ -631,7 +625,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
 	})
 	copy(tcp[header.TCPMinimumSize:], opts)
 
-	length := uint16(hdr.UsedLength() + data.Size())
+	length := uint16(hdr.UsedLength() + packetSize)
 	xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
 	// Only calculate the checksum if offloading isn't supported.
 	if gso != nil && gso.NeedsCsum {
@@ -641,14 +635,71 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
 		// header and data and get the right sum of the TCP packet.
 		tcp.SetChecksum(xsum)
 	} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
-		xsum = header.ChecksumVV(data, xsum)
+		xsum = header.ChecksumVVWithOffset(data, xsum, off, packetSize)
 		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
 	}
 
+}
+
+func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
+	optLen := len(opts)
+	if rcvWnd > 0xffff {
+		rcvWnd = 0xffff
+	}
+
+	mss := int(gso.MSS)
+	n := (data.Size() + mss - 1) / mss
+
+	hdrs := stack.NewPacketDescriptors(n, header.TCPMinimumSize+int(r.MaxHeaderLength())+optLen)
+
+	size := data.Size()
+	off := 0
+	for i := 0; i < n; i++ {
+		packetSize := mss
+		if packetSize > size {
+			packetSize = size
+		}
+		size -= packetSize
+		hdrs[i].Off = off
+		hdrs[i].Size = packetSize
+		buildTCPHdr(r, id, &hdrs[i], data, flags, seq, ack, rcvWnd, opts, gso)
+		off += packetSize
+		seq = seq.Add(seqnum.Size(packetSize))
+	}
+	if ttl == 0 {
+		ttl = r.DefaultTTL()
+	}
+	sent, err := r.WritePackets(gso, hdrs, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
+	if err != nil {
+		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
+	}
+	r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent))
+	return err
+}
+
+// sendTCP sends a TCP segment with the provided options via the provided
+// network endpoint and under the provided identity.
+func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
+	optLen := len(opts)
+	if rcvWnd > 0xffff {
+		rcvWnd = 0xffff
+	}
+
+	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
+		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
+	}
+
+	d := &stack.PacketDescriptor{
+		Hdr:  buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
+		Off:  0,
+		Size: data.Size(),
+	}
+	buildTCPHdr(r, id, d, data, flags, seq, ack, rcvWnd, opts, gso)
+
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(gso, hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil {
+	if err := r.WritePacket(gso, d.Hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil {
 		r.Stats().TCP.SegmentSendErrors.Increment()
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 31a22c1eb..c6bc5528c 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2328,11 +2328,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 	return s
 }
 
-func (e *endpoint) initGSO() {
-	if e.route.Capabilities()&stack.CapabilityGSO == 0 {
-		return
-	}
-
+func (e *endpoint) initHardwareGSO() {
 	gso := &stack.GSO{}
 	switch e.route.NetProto {
 	case header.IPv4ProtocolNumber:
@@ -2350,6 +2346,18 @@ func (e *endpoint) initGSO() {
 	e.gso = gso
 }
 
+func (e *endpoint) initGSO() {
+	if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+		e.initHardwareGSO()
+	} else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 {
+		e.gso = &stack.GSO{
+			MaxSize:   e.route.GSOMaxSize(),
+			Type:      stack.GSOSW,
+			NeedsCsum: false,
+		}
+	}
+}
+
 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
 // state for diagnostics.
 func (e *endpoint) State() uint32 {
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 6e87245b7..91c8487f3 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -140,7 +140,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		stack: s,
 		TransportEndpointInfo: stack.TransportEndpointInfo{
 			NetProto:   netProto,
-			TransProto: header.TCPProtocolNumber,
+			TransProto: header.UDPProtocolNumber,
 		},
 		waiterQueue: waiterQueue,
 		// RFC 1075 section 5.4 recommends a TTL of 1 for membership
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index de026880f..5c3358a5e 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -121,8 +121,15 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 			payloadLen = available
 		}
 
-		payload := buffer.NewVectorisedView(len(netHeader), []buffer.View{netHeader})
-		payload.Append(vv)
+		// The buffers used by vv and netHeader may be used elsewhere
+		// in the system. For example, a raw or packet socket may use
+		// what UDP considers an unreachable destination. Thus we deep
+		// copy vv and netHeader to prevent multiple ownership and
+		// save/restore (SR) errors.
+		newNetHeader := make(buffer.View, len(netHeader))
+		copy(newNetHeader, netHeader)
+		payload := buffer.NewVectorisedView(len(newNetHeader), []buffer.View{newNetHeader})
+		payload.Append(vv.ToView().ToVectorisedView())
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 38278d0a2..72a33534f 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -178,8 +178,11 @@ type Config struct {
 	// capabilities.
 	EnableRaw bool
 
-	// GSO indicates that generic segmentation offload is enabled.
-	GSO bool
+	// HardwareGSO indicates that hardware segmentation offload is enabled.
+	HardwareGSO bool
+
+	// SoftwareGSO indicates that software segmentation offload is enabled.
+	SoftwareGSO bool
 
 	// LogPackets indicates that all network packets should be logged.
 	LogPackets bool
@@ -231,6 +234,10 @@ type Config struct {
 	// ReferenceLeakMode sets reference leak check mode
 	ReferenceLeakMode refs.LeakMode
 
+	// OverlayfsStaleRead causes cached FDs to reopen after a file is opened for
+	// write to work around an overlayfs limitation on kernels before 4.19.
+	OverlayfsStaleRead bool
+
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
 	// tests. It allows runsc to start the sandbox process as the current
 	// user, and without chrooting the sandbox process. This can be
@@ -271,6 +278,9 @@ func (c *Config) ToFlags() []string {
 		"--rootless=" + strconv.FormatBool(c.Rootless),
 		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
 		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
+		"--gso=" + strconv.FormatBool(c.HardwareGSO),
+		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
 	}
 	// Only include these if set since it is never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index a73c593ea..5f644b57e 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 const (
@@ -237,6 +238,9 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
 	}
 
+	// All validation passed; log the spec for debugging.
+	specutils.LogSpec(args.Spec)
+
 	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
 	if err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index a2ecc6bcb..5ad108261 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -44,6 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 	},
 	syscall.SYS_CLOSE:         {},
 	syscall.SYS_DUP:           {},
+	syscall.SYS_DUP2:          {},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
@@ -242,6 +243,15 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(0),
 		},
 	},
+	unix.SYS_SENDMMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT),
+			seccomp.AllowValue(0),
+		},
+	},
 	syscall.SYS_RESTART_SYSCALL: {},
 	syscall.SYS_RT_SIGACTION:    {},
 	syscall.SYS_RT_SIGPROCMASK:  {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 393c2a88b..76036c147 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -703,6 +703,14 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
 	p9FS := mustFindFilesystem("9p")
 	opts := p9MountOptions(fd, conf.FileAccess)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because the sandbox is chroot'ed and
+		// the gofer can only send mount options for specs.Mounts (specs.Root has
+		// no Options field). So assume root is always on top of overlayfs.
+		opts = append(opts, "overlayfs_stale_read")
+	}
+
 	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
 	if err != nil {
 		return nil, fmt.Errorf("creating root mount point: %v", err)
@@ -737,7 +745,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		fsName     string
 		opts       []string
 		useOverlay bool
-		err        error
 	)
 
 	switch m.Type {
@@ -747,7 +754,12 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		fsName = sysfs
 	case tmpfs:
 		fsName = m.Type
+
+		var err error
 		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		if err != nil {
+			return "", nil, false, err
+		}
 
 	case bind:
 		fd := c.fds.remove()
@@ -763,7 +775,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		// for now.
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
-	return fsName, opts, useOverlay, err
+	return fsName, opts, useOverlay, nil
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index c8e5e86ee..0c0eba99e 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -922,7 +922,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 			HandleLocal:        true,
 			// Enable raw sockets for users with sufficient
 			// privileges.
-			UnassociatedFactory: raw.EndpointFactory{},
+			RawFactory: raw.EndpointFactory{},
 		})}
 
 		// Enable SACK Recovery.
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 32cba5ac1..f98c5fd36 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -50,12 +50,13 @@ type DefaultRoute struct {
 
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
-	Name        string
-	MTU         int
-	Addresses   []net.IP
-	Routes      []Route
-	GSOMaxSize  uint32
-	LinkAddress net.HardwareAddr
+	Name               string
+	MTU                int
+	Addresses          []net.IP
+	Routes             []Route
+	GSOMaxSize         uint32
+	SoftwareGSOEnabled bool
+	LinkAddress        net.HardwareAddr
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -163,6 +164,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 			Address:            mac,
 			PacketDispatchMode: fdbased.RecvMMsg,
 			GSOMaxSize:         link.GSOMaxSize,
+			SoftwareGSOEnabled: link.SoftwareGSOEnabled,
 			RXChecksumOffload:  true,
 		})
 		if err != nil {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 4c2fb80bf..4831210c0 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -27,6 +27,7 @@ import (
 	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/unet"
@@ -135,7 +136,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	//
 	// Note that all mount points have been mounted in the proper location in
 	// setupRootFS().
-	cleanMounts, err := resolveMounts(spec.Mounts, root)
+	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
 	if err != nil {
 		Fatalf("Failure to resolve mounts: %v", err)
 	}
@@ -380,7 +381,7 @@ func setupMounts(mounts []specs.Mount, root string) error {
 // Otherwise, it may follow symlinks to locations that would be overwritten
 // with another mount point and return the wrong location. In short, make sure
 // setupMounts() has been called before.
-func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 	cleanMounts := make([]specs.Mount, 0, len(mounts))
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -395,8 +396,15 @@ func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
 		if err != nil {
 			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
 		}
+
+		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
+		if err != nil {
+			return nil, err
+		}
+
 		cpy := m
 		cpy.Destination = filepath.Join("/", relDst)
+		cpy.Options = opts
 		cleanMounts = append(cleanMounts, cpy)
 	}
 	return cleanMounts, nil
@@ -453,3 +461,20 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
 	}
 	return base, nil
 }
+
+// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
+func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+	rv := make([]string, len(opts))
+	copy(rv, opts)
+
+	if conf.OverlayfsStaleRead {
+		statfs := syscall.Statfs_t{}
+		if err := syscall.Statfs(path, &statfs); err != nil {
+			return nil, err
+		}
+		if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
+			rv = append(rv, "overlayfs_stale_read")
+		}
+	}
+	return rv, nil
+}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index a721c1c31..32510d427 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -1149,7 +1149,7 @@ func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, err
 }
 
 func isRoot(spec *specs.Spec) bool {
-	return specutils.ShouldCreateSandbox(spec)
+	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
 }
 
 // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
@@ -1198,7 +1198,7 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool)
 	// Get the lowest score for all containers.
 	var lowScore int
 	scoreFound := false
-	if len(containers) == 1 && len(containers[0].Spec.Annotations[specutils.ContainerdContainerTypeAnnotation]) == 0 {
+	if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified {
 		// This is a single-container sandbox. Set the oom_score_adj to
 		// the value specified in the OCI bundle.
 		if containers[0].Spec.Process.OOMScoreAdj != nil {
@@ -1214,7 +1214,7 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool)
 			//
 			// We will use OOMScoreAdj in the single-container case where the
 			// containerd container-type annotation is not present.
-			if container.Spec.Annotations[specutils.ContainerdContainerTypeAnnotation] == specutils.ContainerdContainerTypeSandbox {
+			if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
 				continue
 			}
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 519f5ed9b..c4c56b2e0 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -2074,6 +2074,43 @@ func TestNetRaw(t *testing.T) {
 	}
 }
 
+// TestOverlayfsStaleRead is the most basic test that '--overlayfs-stale-read' works.
+func TestOverlayfsStaleRead(t *testing.T) {
+	conf := testutil.TestConfig()
+	conf.OverlayfsStaleRead = true
+
+	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer in.Close()
+	if _, err := in.WriteString("stale data"); err != nil {
+		t.Fatalf("in.Write() failed: %v", err)
+	}
+
+	out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer out.Close()
+
+	const want = "foobar"
+	cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
+	spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
+	if err := run(spec, conf); err != nil {
+		t.Fatalf("Error running container: %v", err)
+	}
+
+	gotBytes, err := ioutil.ReadAll(out)
+	if err != nil {
+		t.Fatalf("out.Read() failed: %v", err)
+	}
+	got := strings.TrimSpace(string(gotBytes))
+	if want != got {
+		t.Errorf("Wrong content in out file, got: %q. want: %q", got, want)
+	}
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 0bf7507b7..2ea95f8fb 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -220,6 +220,18 @@ var udsSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SOCKET: []seccomp.Rule{
 		{
 			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_STREAM),
+			seccomp.AllowValue(0),
+		},
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_DGRAM),
+			seccomp.AllowValue(0),
+		},
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_SEQPACKET),
+			seccomp.AllowValue(0),
 		},
 	},
 	syscall.SYS_CONNECT: []seccomp.Rule{
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 29a82138e..3fceecb3d 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -21,7 +21,6 @@
 package fsgofer
 
 import (
-	"errors"
 	"fmt"
 	"io"
 	"math"
@@ -126,13 +125,6 @@ func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) {
 
 // Attach implements p9.Attacher.
 func (a *attachPoint) Attach() (p9.File, error) {
-	// dirFD (1st argument) is ignored because 'prefix' is always absolute.
-	stat, err := statAt(-1, a.prefix)
-	if err != nil {
-		return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
-	}
-
-	// Acquire the attach point lock.
 	a.attachedMu.Lock()
 	defer a.attachedMu.Unlock()
 
@@ -140,47 +132,24 @@ func (a *attachPoint) Attach() (p9.File, error) {
 		return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
 	}
 
-	// Hold the file descriptor we are converting into a p9.File.
-	var f *fd.FD
-
-	// Apply the S_IFMT bitmask so we can detect file type appropriately.
-	switch fmtStat := stat.Mode & syscall.S_IFMT; fmtStat {
-	case syscall.S_IFSOCK:
-		// Check to see if the CLI option has been set to allow the UDS mount.
-		if !a.conf.HostUDS {
-			return nil, errors.New("host UDS support is disabled")
-		}
-
-		// Attempt to open a connection. Bubble up the failures.
-		f, err = fd.DialUnix(a.prefix)
-		if err != nil {
-			return nil, err
-		}
-
-	default:
-		// Default to Read/Write permissions.
-		mode := syscall.O_RDWR
-
-		// If the configuration is Read Only or the mount point is a directory,
-		// set the mode to Read Only.
-		if a.conf.ROMount || fmtStat == syscall.S_IFDIR {
-			mode = syscall.O_RDONLY
-		}
+	f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+		return fd.Open(a.prefix, openFlags|mode, 0)
+	})
+	if err != nil {
+		return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
+	}
 
-		// Open the mount point & capture the FD.
-		f, err = fd.Open(a.prefix, openFlags|mode, 0)
-		if err != nil {
-			return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err)
-		}
+	stat, err := stat(f.FD())
+	if err != nil {
+		return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
 	}
 
-	// Return a localFile object to the caller with the UDS FD included.
-	rv, err := newLocalFile(a, f, a.prefix, stat)
+	lf, err := newLocalFile(a, f, a.prefix, stat)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
 	}
 	a.attached = true
-	return rv, nil
+	return lf, nil
 }
 
 // makeQID returns a unique QID for the given stat buffer.
@@ -296,10 +265,10 @@ func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, erro
 // actual file open and is customizable by the caller.
 func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
 	// Attempt to open file in the following mode in order:
-	//   1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too.
-	//      Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option
-	//      has no effect on regular files.
-	//   2. PATH: for symlinks
+	//   1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
+	//      Use non-blocking to prevent getting stuck inside open(2) for
+	//      FIFOs. This option has no effect on regular files.
+	//   2. PATH: for symlinks, sockets.
 	modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
 
 	var err error
@@ -1063,12 +1032,48 @@ func (l *localFile) Flush() error {
 }
 
 // Connect implements p9.File.
-func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
-	// Check to see if the CLI option has been set to allow the UDS mount.
+func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	if !l.attachPoint.conf.HostUDS {
 		return nil, syscall.ECONNREFUSED
 	}
-	return fd.DialUnix(l.hostPath)
+
+	// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
+	// mappings, the app path may have fit in the sockaddr, but we can't
+	// fit l.hostPath in our sockaddr. We'd need to redirect through a
+	// shorter path in order to actually connect to this socket.
+	if len(l.hostPath) > linux.UnixPathMax {
+		return nil, syscall.ECONNREFUSED
+	}
+
+	var stype int
+	switch flags {
+	case p9.StreamSocket:
+		stype = syscall.SOCK_STREAM
+	case p9.DgramSocket:
+		stype = syscall.SOCK_DGRAM
+	case p9.SeqpacketSocket:
+		stype = syscall.SOCK_SEQPACKET
+	default:
+		return nil, syscall.ENXIO
+	}
+
+	f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := syscall.SetNonblock(f, true); err != nil {
+		syscall.Close(f)
+		return nil, err
+	}
+
+	sa := syscall.SockaddrUnix{Name: l.hostPath}
+	if err := syscall.Connect(f, &sa); err != nil {
+		syscall.Close(f)
+		return nil, err
+	}
+
+	return fd.New(f), nil
 }
 
 // Close implements p9.File.
diff --git a/runsc/main.go b/runsc/main.go
index 7dce9dc00..ae906c661 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -41,35 +41,37 @@ import (
 var (
 	// Although these flags are not part of the OCI spec, they are used by
 	// Docker, and thus should not be changed.
-	rootDir     = flag.String("root", "", "root directory for storage of container state")
-	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout")
-	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s")
-	debug       = flag.Bool("debug", false, "enable debug logging")
-	showVersion = flag.Bool("version", false, "show version and exit")
+	rootDir     = flag.String("root", "", "root directory for storage of container state.")
+	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+	debug       = flag.Bool("debug", false, "enable debug logging.")
+	showVersion = flag.Bool("version", false, "show version and exit.")
 
 	// These flags are unique to runsc, and are used to configure parts of the
 	// system that are not covered by the runtime spec.
 
 	// Debugging flags.
 	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
-	logPackets      = flag.Bool("log-packets", false, "enable network packet logging")
+	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
 	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
 	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
-	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
-	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr")
+	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
 
 	// Debugging flags: strace related
-	strace         = flag.Bool("strace", false, "enable strace")
+	strace         = flag.Bool("strace", false, "enable strace.")
 	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
-	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
+	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
 
 	// Flags that control sandbox runtime behavior.
-	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
 	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
-	gso                = flag.Bool("gso", true, "enable generic segmenation offload")
+	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
 	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
-	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "Allow the gofer to mount Unix Domain Sockets.")
+	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
 	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.")
 	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
 	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
 	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
@@ -199,7 +201,8 @@ func main() {
 		FSGoferHostUDS:     *fsGoferHostUDS,
 		Overlay:            *overlay,
 		Network:            netType,
-		GSO:                *gso,
+		HardwareGSO:        *hardwareGSO,
+		SoftwareGSO:        *softwareGSO,
 		LogPackets:         *logPackets,
 		Platform:           platformType,
 		Strace:             *strace,
@@ -212,6 +215,7 @@ func main() {
 		Rootless:           *rootless,
 		AlsoLogToStderr:    *alsoLogToStderr,
 		ReferenceLeakMode:  refsLeakMode,
+		OverlayfsStaleRead: *overlayfsStaleRead,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 7fdceaab6..27459e6d1 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/platform",
+        "//pkg/tcpip/stack",
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 5634f0707..d42de0176 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -28,6 +28,7 @@ import (
 	"github.com/vishvananda/netlink"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -61,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
-		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
 	case boot.NetworkHost:
@@ -136,7 +137,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
@@ -232,7 +233,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 		// Create the socket for the device.
 		for i := 0; i < link.NumChannels; i++ {
 			log.Debugf("Creating Channel %d", i)
-			socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
+			socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
 			if err != nil {
 				return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
 			}
@@ -246,6 +247,11 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 			}
 			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
 		}
+		if link.GSOMaxSize == 0 && softwareGSO {
+			// Hardware GSO is disabled. Let's enable software GSO.
+			link.GSOMaxSize = stack.SoftwareGSOMaxSize
+			link.SoftwareGSOEnabled = true
+		}
 
 		// Collect the addresses for the interface, enable forwarding,
 		// and remove them from the host.
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index fa58313a0..205638803 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_library(
     name = "specutils",
     srcs = [
+        "cri.go",
         "fs.go",
         "namespace.go",
         "specutils.go",
diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go
new file mode 100644
index 000000000..9c5877cd5
--- /dev/null
+++ b/runsc/specutils/cri.go
@@ -0,0 +1,110 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+const (
+	// ContainerdContainerTypeAnnotation is the OCI annotation set by
+	// containerd to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+	// ContainerdContainerTypeContainer is the container type value
+	// indicating the container should be created in an existing sandbox.
+	ContainerdContainerTypeContainer = "container"
+	// ContainerdContainerTypeSandbox is the container type value
+	// indicating the container should be created in a new sandbox.
+	ContainerdContainerTypeSandbox = "sandbox"
+
+	// ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+
+	// CRIOContainerTypeAnnotation is the OCI annotation set by
+	// CRI-O to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType"
+
+	// CRIOContainerTypeContainer is the container type value
+	// indicating the container should be created in an existing sandbox.
+	CRIOContainerTypeContainer = "container"
+	// CRIOContainerTypeSandbox is the container type value
+	// indicating the container should be created in a new sandbox.
+	CRIOContainerTypeSandbox = "sandbox"
+
+	// CRIOSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID"
+)
+
+// ContainerType represents the type of container requested by the calling container manager.
+type ContainerType int
+
+const (
+	// ContainerTypeUnspecified indicates that no known container type
+	// annotation was found in the spec.
+	ContainerTypeUnspecified ContainerType = iota
+	// ContainerTypeUnknown indicates that a container type was specified
+	// but is unknown to us.
+	ContainerTypeUnknown
+	// ContainerTypeSandbox indicates that the container should be run in a
+	// new sandbox.
+	ContainerTypeSandbox
+	// ContainerTypeContainer indicates that the container should be run in
+	// an existing sandbox.
+	ContainerTypeContainer
+)
+
+// SpecContainerType classifies the container described by spec using the
+// well-known CRI annotations, consulting containerd's key before CRI-O's.
+func SpecContainerType(spec *specs.Spec) ContainerType {
+	// Annotation key and recognized values for each container manager.
+	managers := []struct{ key, sandbox, container string }{
+		{ContainerdContainerTypeAnnotation, ContainerdContainerTypeSandbox, ContainerdContainerTypeContainer},
+		{CRIOContainerTypeAnnotation, CRIOContainerTypeSandbox, CRIOContainerTypeContainer},
+	}
+	for _, m := range managers {
+		val, found := spec.Annotations[m.key]
+		if !found {
+			continue
+		}
+		// The first annotation present decides the result, even when
+		// its value is not one we recognize.
+		switch val {
+		case m.sandbox:
+			return ContainerTypeSandbox
+		case m.container:
+			return ContainerTypeContainer
+		}
+		return ContainerTypeUnknown
+	}
+	return ContainerTypeUnspecified
+}
+
+// SandboxID reports the ID of the sandbox that the container should join,
+// along with whether such an ID is present in the spec's annotations. The
+// containerd annotation takes precedence over the CRI-O one.
+func SandboxID(spec *specs.Spec) (string, bool) {
+	for _, key := range []string{ContainerdSandboxIDAnnotation, CRIOSandboxIDAnnotation} {
+		if id, found := spec.Annotations[key]; found {
+			return id, true
+		}
+	}
+	return "", false
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 3d9ced1b6..d3c2e4e78 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -108,23 +108,18 @@ func ValidateSpec(spec *specs.Spec) error {
 		}
 	}
 
-	// Two annotations are use by containerd to support multi-container pods.
-	//   "io.kubernetes.cri.container-type"
-	//   "io.kubernetes.cri.sandbox-id"
-	containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation]
-	_, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation]
-	switch {
-	// Non-containerd use won't set a container type.
-	case !hasContainerType:
-	case containerType == ContainerdContainerTypeSandbox:
-	// When starting a container in an existing sandbox, the sandbox ID
-	// must be set.
-	case containerType == ContainerdContainerTypeContainer:
-		if !hasSandboxID {
-			return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType)
+	// CRI specifies whether a container should start a new sandbox, or run
+	// another container in an existing sandbox.
+	switch SpecContainerType(spec) {
+	case ContainerTypeContainer:
+		// When starting a container in an existing sandbox, the
+		// sandbox ID must be set.
+		if _, ok := SandboxID(spec); !ok {
+			return fmt.Errorf("spec has container-type of container, but no sandbox ID set")
 		}
+	case ContainerTypeUnknown:
+		return fmt.Errorf("unknown container-type")
 	default:
-		return fmt.Errorf("unknown container-type: %s", containerType)
 	}
 
 	return nil
@@ -338,39 +333,6 @@ func IsSupportedDevMount(m specs.Mount) bool {
 	return true
 }
 
-const (
-	// ContainerdContainerTypeAnnotation is the OCI annotation set by
-	// containerd to indicate whether the container to create should have
-	// its own sandbox or a container within an existing sandbox.
-	ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
-	// ContainerdContainerTypeContainer is the container type value
-	// indicating the container should be created in an existing sandbox.
-	ContainerdContainerTypeContainer = "container"
-	// ContainerdContainerTypeSandbox is the container type value
-	// indicating the container should be created in a new sandbox.
-	ContainerdContainerTypeSandbox = "sandbox"
-
-	// ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
-	// which sandbox the container should be created in when the container
-	// is not the first container in the sandbox.
-	ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
-)
-
-// ShouldCreateSandbox returns true if the spec indicates that a new sandbox
-// should be created for the container. If false, the container should be
-// started in an existing sandbox.
-func ShouldCreateSandbox(spec *specs.Spec) bool {
-	t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]
-	return !ok || t == ContainerdContainerTypeSandbox
-}
-
-// SandboxID returns the ID of the sandbox to join and whether an ID was found
-// in the spec.
-func SandboxID(spec *specs.Spec) (string, bool) {
-	id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]
-	return id, ok
-}
-
 // WaitForReady waits for a process to become ready. The process is ready when
 // the 'ready' function returns true. It continues to wait if 'ready' returns
 // false. It returns error on timeout, if the process stops or if 'ready' fails.
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index d44ebc906..c96ca2eb6 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -9,6 +9,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/runsc/testutil",
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/log",
         "//runsc/boot",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index edf8b126c..26467bdc7 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -25,7 +25,6 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"log"
 	"math"
 	"math/rand"
 	"net/http"
@@ -42,6 +41,7 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -286,7 +286,7 @@ func WaitForHTTP(port int, timeout time.Duration) error {
 		url := fmt.Sprintf("http://localhost:%d/", port)
 		resp, err := c.Get(url)
 		if err != nil {
-			log.Printf("Waiting %s: %v", url, err)
+			log.Infof("Waiting %s: %v", url, err)
 			return err
 		}
 		resp.Body.Close()
diff --git a/scripts/swgso_tests.sh b/scripts/swgso_tests.sh
new file mode 100755
index 000000000..0de2df1d2
--- /dev/null
+++ b/scripts/swgso_tests.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# Install the runtime and perform basic tests.
+install_runsc_for_test swgso --software-gso=true --gso=false
+test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/test/README.md b/test/README.md
index 09c36b461..97fe7ea04 100644
--- a/test/README.md
+++ b/test/README.md
@@ -10,9 +10,31 @@ they may need extra setup in the test machine and extra configuration to run.
     functionality.
 -   **image:** basic end to end test for popular images. These require the same
     setup as integration tests.
--   **root:** tests that require to be run as root.
+-   **root:** tests that require to be run as root. These require the same setup
+    as integration tests.
 -   **util:** utilities library to support the tests.
 
 For the above noted cases, the relevant runtime must be installed via `runsc
-install` before running. This is handled automatically by the test scripts in
-the `kokoro` directory.
+install` before running. Just note that they require specific configuration to
+work. This is handled automatically by the test scripts in the `scripts`
+directory and they can be used to run tests locally on your machine. They are
+also used to run these tests in `kokoro`.
+
+**Example:**
+
+To run image and integration tests, run:
+
+`./scripts/docker_test.sh`
+
+To run root tests, run:
+
+`./scripts/root_test.sh`
+
+There are a few other interesting variations for image and integration tests:
+
+*   overlay: sets writable overlay inside the sentry
+*   hostnet: configures host network pass-thru, instead of netstack
+*   kvm: runs the test using the KVM platform, instead of ptrace
+
+The test will build runsc, configure it with your local docker, restart
+`dockerd`, and run tests. The location for runsc logs is printed to the output.
diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
index c962a3159..4074d2285 100644
--- a/test/e2e/exec_test.go
+++ b/test/e2e/exec_test.go
@@ -109,7 +109,7 @@ func TestExecPrivileged(t *testing.T) {
 	t.Logf("Exec CapEff: %v", got)
 	want := fmt.Sprintf("CapEff:\t%016x\n", specutils.AllCapabilitiesUint64()&^bits.MaskOf64(int(linux.CAP_NET_RAW)))
 	if got != want {
-		t.Errorf("wrong capabilities, got: %q, want: %q", got, want)
+		t.Errorf("Wrong capabilities, got: %q, want: %q. Make sure runsc is not using '--net-raw'", got, want)
 	}
 }
 
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 87ef87e07..a53a23afd 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -79,6 +79,12 @@ syscall_test(test = "//test/syscalls/linux:clock_nanosleep_test")
 syscall_test(test = "//test/syscalls/linux:concurrency_test")
 
 syscall_test(
+    add_uds_tree = True,
+    test = "//test/syscalls/linux:connect_external_test",
+    use_tmpfs = True,
+)
+
+syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:creat_test",
 )
@@ -716,6 +722,7 @@ go_binary(
         "//runsc/specutils",
         "//runsc/testutil",
         "//test/syscalls/gtest",
+        "//test/uds",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index e94ef5602..dcf5b73ed 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -8,6 +8,7 @@ def syscall_test(
         size = "small",
         use_tmpfs = False,
         add_overlay = False,
+        add_uds_tree = False,
         tags = None):
     _syscall_test(
         test = test,
@@ -15,6 +16,7 @@ def syscall_test(
         size = size,
         platform = "native",
         use_tmpfs = False,
+        add_uds_tree = add_uds_tree,
         tags = tags,
     )
 
@@ -24,6 +26,7 @@ def syscall_test(
         size = size,
         platform = "kvm",
         use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
         tags = tags,
     )
 
@@ -33,6 +36,7 @@ def syscall_test(
         size = size,
         platform = "ptrace",
         use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
         tags = tags,
     )
 
@@ -43,6 +47,7 @@ def syscall_test(
             size = size,
             platform = "ptrace",
             use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
+            add_uds_tree = add_uds_tree,
             tags = tags,
             overlay = True,
         )
@@ -55,6 +60,7 @@ def syscall_test(
             size = size,
             platform = "ptrace",
             use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
             tags = tags,
             file_access = "shared",
         )
@@ -67,7 +73,8 @@ def _syscall_test(
         use_tmpfs,
         tags,
         file_access = "exclusive",
-        overlay = False):
+        overlay = False,
+        add_uds_tree = False):
     test_name = test.split(":")[1]
 
     # Prepend "runsc" to non-native platform names.
@@ -103,6 +110,7 @@ def _syscall_test(
         "--use-tmpfs=" + str(use_tmpfs),
         "--file-access=" + file_access,
         "--overlay=" + str(overlay),
+        "--add-uds-tree=" + str(add_uds_tree),
     ]
 
     sh_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index d243be9e4..833fbaa09 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -480,6 +480,21 @@ cc_binary(
 )
 
 cc_binary(
+    name = "connect_external_test",
+    testonly = 1,
+    srcs = ["connect_external.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_binary(
     name = "creat_test",
     testonly = 1,
     srcs = ["creat.cc"],
@@ -655,6 +670,7 @@ cc_binary(
         "//test/util:thread_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:optional",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1558,6 +1574,8 @@ cc_binary(
         "//test/util:fs_util",
         "//test/util:test_main",
         "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 1122ea240..328192a05 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -140,6 +140,18 @@ TEST_P(AllSocketPairTest, Connect) {
               SyscallSucceeds());
 }
 
+TEST_P(AllSocketPairTest, ConnectNonListening) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+                   sockets->first_addr_size()),
+              SyscallSucceeds());
+
+  ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+                      sockets->first_addr_size()),
+              SyscallFailsWithErrno(ECONNREFUSED));
+}
+
 TEST_P(AllSocketPairTest, ConnectToFilePath) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc
new file mode 100644
index 000000000..98032ac19
--- /dev/null
+++ b/test/syscalls/linux/connect_external.cc
@@ -0,0 +1,164 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <string>
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+// This file contains tests specific to connecting to host UDS managed outside
+// the sandbox / test.
+//
+// A set of utility sockets will be created externally in $TEST_UDS_TREE and
+// $TEST_UDS_ATTACH_TREE for these tests to interact with.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+struct ProtocolSocket {
+  int protocol;
+  std::string name;
+};
+
+// Parameter is (socket root dir, ProtocolSocket).
+using GoferStreamSeqpacketTest =
+    ::testing::TestWithParam<std::tuple<std::string, ProtocolSocket>>;
+
+// Connect to a socket and verify that write/read work.
+//
+// An "echo" socket doesn't work for dgram sockets because our socket is
+// unnamed. The server thus has no way to reply to us.
+TEST_P(GoferStreamSeqpacketTest, Echo) {
+  std::string env;
+  ProtocolSocket proto;
+  std::tie(env, proto) = GetParam();
+
+  char *val = getenv(env.c_str());
+  ASSERT_NE(val, nullptr);
+  std::string root(val);
+
+  FileDescriptor sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, proto.protocol, 0));
+
+  std::string socket_path = JoinPath(root, proto.name, "echo");
+
+  struct sockaddr_un addr = {};
+  addr.sun_family = AF_UNIX;
+  memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
+
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+
+  constexpr int kBufferSize = 64;
+  char send_buffer[kBufferSize];
+  memset(send_buffer, 'a', sizeof(send_buffer));
+
+  ASSERT_THAT(WriteFd(sock.get(), send_buffer, sizeof(send_buffer)),
+              SyscallSucceedsWithValue(sizeof(send_buffer)));
+
+  char recv_buffer[kBufferSize];
+  ASSERT_THAT(ReadFd(sock.get(), recv_buffer, sizeof(recv_buffer)),
+              SyscallSucceedsWithValue(sizeof(recv_buffer)));
+  ASSERT_EQ(0, memcmp(send_buffer, recv_buffer, sizeof(send_buffer)));
+}
+
+// It is not possible to connect to a bound but non-listening socket.
+TEST_P(GoferStreamSeqpacketTest, NonListening) {
+  std::string env;
+  ProtocolSocket proto;
+  std::tie(env, proto) = GetParam();
+
+  char *val = getenv(env.c_str());
+  ASSERT_NE(val, nullptr);
+  std::string root(val);
+
+  FileDescriptor sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, proto.protocol, 0));
+
+  std::string socket_path = JoinPath(root, proto.name, "nonlistening");
+
+  struct sockaddr_un addr = {};
+  addr.sun_family = AF_UNIX;
+  memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
+
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+                      sizeof(addr)),
+              SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    StreamSeqpacket, GoferStreamSeqpacketTest,
+    ::testing::Combine(
+        // Test access via standard path and attach point.
+        ::testing::Values("TEST_UDS_TREE", "TEST_UDS_ATTACH_TREE"),
+        ::testing::Values(ProtocolSocket{SOCK_STREAM, "stream"},
+                          ProtocolSocket{SOCK_SEQPACKET, "seqpacket"})));
+
+// Parameter is socket root dir.
+using GoferDgramTest = ::testing::TestWithParam<std::string>;
+
+// Connect to a socket and verify that write works.
+//
+// An "echo" socket doesn't work for dgram sockets because our socket is
+// unnamed. The server thus has no way to reply to us.
+TEST_P(GoferDgramTest, Null) {
+  std::string env = GetParam();
+  char *val = getenv(env.c_str());
+  ASSERT_NE(val, nullptr);
+  std::string root(val);
+
+  FileDescriptor sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_DGRAM, 0));
+
+  std::string socket_path = JoinPath(root, "dgram/null");
+
+  struct sockaddr_un addr = {};
+  addr.sun_family = AF_UNIX;
+  memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
+
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+
+  constexpr int kBufferSize = 64;
+  char send_buffer[kBufferSize];
+  memset(send_buffer, 'a', sizeof(send_buffer));
+
+  ASSERT_THAT(WriteFd(sock.get(), send_buffer, sizeof(send_buffer)),
+              SyscallSucceedsWithValue(sizeof(send_buffer)));
+}
+
+INSTANTIATE_TEST_SUITE_P(Dgram, GoferDgramTest,
+                         // Test access via standard path and attach point.
+                         ::testing::Values("TEST_UDS_TREE",
+                                           "TEST_UDS_ATTACH_TREE"));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 4c7c95321..85734c290 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -33,6 +33,7 @@
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/types/optional.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/multiprocess_util.h"
@@ -68,11 +69,12 @@ constexpr char kExit42[] = "--exec_exit_42";
 constexpr char kExecWithThread[] = "--exec_exec_with_thread";
 constexpr char kExecFromThread[] = "--exec_exec_from_thread";
 
-// Runs filename with argv and checks that the exit status is expect_status and
-// that stderr contains expect_stderr.
-void CheckOutput(const std::string& filename, const ExecveArray& argv,
-                 const ExecveArray& envv, int expect_status,
-                 const std::string& expect_stderr) {
+// Runs file specified by dirfd and pathname with argv and checks that the exit
+// status is expect_status and that stderr contains expect_stderr.
+void CheckExecHelper(const absl::optional<int32_t> dirfd,
+                     const std::string& pathname, const ExecveArray& argv,
+                     const ExecveArray& envv, const int flags,
+                     int expect_status, const std::string& expect_stderr) {
   int pipe_fds[2];
   ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds());
 
@@ -110,8 +112,15 @@ void CheckOutput(const std::string& filename, const ExecveArray& argv,
     // CloexecEventfd depend on that not happening.
   };
 
-  auto kill = ASSERT_NO_ERRNO_AND_VALUE(
-      ForkAndExec(filename, argv, envv, remap_stderr, &child, &execve_errno));
+  Cleanup kill;
+  if (dirfd.has_value()) {
+    kill = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(*dirfd, pathname, argv,
+                                                     envv, flags, remap_stderr,
+                                                     &child, &execve_errno));
+  } else {
+    kill = ASSERT_NO_ERRNO_AND_VALUE(
+        ForkAndExec(pathname, argv, envv, remap_stderr, &child, &execve_errno));
+  }
 
   ASSERT_EQ(0, execve_errno);
 
@@ -140,57 +149,71 @@ void CheckOutput(const std::string& filename, const ExecveArray& argv,
   EXPECT_TRUE(absl::StrContains(output, expect_stderr)) << output;
 }
 
-TEST(ExecDeathTest, EmptyPath) {
+void CheckExec(const std::string& filename, const ExecveArray& argv,
+               const ExecveArray& envv, int expect_status,
+               const std::string& expect_stderr) {
+  CheckExecHelper(/*dirfd=*/absl::optional<int32_t>(), filename, argv, envv,
+                  /*flags=*/0, expect_status, expect_stderr);
+}
+
+void CheckExecveat(const int32_t dirfd, const std::string& pathname,
+                   const ExecveArray& argv, const ExecveArray& envv,
+                   const int flags, int expect_status,
+                   const std::string& expect_stderr) {
+  CheckExecHelper(absl::optional<int32_t>(dirfd), pathname, argv, envv, flags,
+                  expect_status, expect_stderr);
+}
+
+TEST(ExecTest, EmptyPath) {
   int execve_errno;
   ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec("", {}, {}, nullptr, &execve_errno));
   EXPECT_EQ(execve_errno, ENOENT);
 }
 
-TEST(ExecDeathTest, Basic) {
-  CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {},
-              ArgEnvExitStatus(0, 0),
-              absl::StrCat(WorkloadPath(kBasicWorkload), "\n"));
+TEST(ExecTest, Basic) {
+  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {},
+            ArgEnvExitStatus(0, 0),
+            absl::StrCat(WorkloadPath(kBasicWorkload), "\n"));
 }
 
-TEST(ExecDeathTest, OneArg) {
-  CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"},
-              {}, ArgEnvExitStatus(1, 0),
-              absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+TEST(ExecTest, OneArg) {
+  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"},
+            {}, ArgEnvExitStatus(1, 0),
+            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
 }
 
-TEST(ExecDeathTest, FiveArg) {
-  CheckOutput(WorkloadPath(kBasicWorkload),
-              {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
-              ArgEnvExitStatus(5, 0),
-              absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+TEST(ExecTest, FiveArg) {
+  CheckExec(WorkloadPath(kBasicWorkload),
+            {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
+            ArgEnvExitStatus(5, 0),
+            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
 }
 
-TEST(ExecDeathTest, OneEnv) {
-  CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)},
-              {"1"}, ArgEnvExitStatus(0, 1),
-              absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+TEST(ExecTest, OneEnv) {
+  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {"1"},
+            ArgEnvExitStatus(0, 1),
+            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
 }
 
-TEST(ExecDeathTest, FiveEnv) {
-  CheckOutput(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)},
-              {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5),
-              absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+TEST(ExecTest, FiveEnv) {
+  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)},
+            {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5),
+            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
 }
 
-TEST(ExecDeathTest, OneArgOneEnv) {
-  CheckOutput(WorkloadPath(kBasicWorkload),
-              {WorkloadPath(kBasicWorkload), "arg"}, {"env"},
-              ArgEnvExitStatus(1, 1),
-              absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n"));
+TEST(ExecTest, OneArgOneEnv) {
+  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "arg"},
+            {"env"}, ArgEnvExitStatus(1, 1),
+            absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n"));
 }
 
-TEST(ExecDeathTest, InterpreterScript) {
-  CheckOutput(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {},
-              ArgEnvExitStatus(25, 0), "");
+TEST(ExecTest, InterpreterScript) {
+  CheckExec(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {},
+            ArgEnvExitStatus(25, 0), "");
 }
 
 // Everything after the path in the interpreter script is a single argument.
-TEST(ExecDeathTest, InterpreterScriptArgSplit) {
+TEST(ExecTest, InterpreterScriptArgSplit) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -199,12 +222,12 @@ TEST(ExecDeathTest, InterpreterScriptArgSplit) {
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"),
       0755));
 
-  CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
-              absl::StrCat(link.path(), "\nfoo bar\n", script.path(), "\n"));
+  CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
+            absl::StrCat(link.path(), "\nfoo bar\n", script.path(), "\n"));
 }
 
 // Original argv[0] is replaced with the script path.
-TEST(ExecDeathTest, InterpreterScriptArgvZero) {
+TEST(ExecTest, InterpreterScriptArgvZero) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -212,13 +235,13 @@ TEST(ExecDeathTest, InterpreterScriptArgvZero) {
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
 
-  CheckOutput(script.path(), {"REPLACED"}, {}, ArgEnvExitStatus(1, 0),
-              absl::StrCat(link.path(), "\n", script.path(), "\n"));
+  CheckExec(script.path(), {"REPLACED"}, {}, ArgEnvExitStatus(1, 0),
+            absl::StrCat(link.path(), "\n", script.path(), "\n"));
 }
 
 // Original argv[0] is replaced with the script path, exactly as passed to
 // execve.
-TEST(ExecDeathTest, InterpreterScriptArgvZeroRelative) {
+TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -230,12 +253,12 @@ TEST(ExecDeathTest, InterpreterScriptArgvZeroRelative) {
   auto script_relative =
       ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, script.path()));
 
-  CheckOutput(script_relative, {"REPLACED"}, {}, ArgEnvExitStatus(1, 0),
-              absl::StrCat(link.path(), "\n", script_relative, "\n"));
+  CheckExec(script_relative, {"REPLACED"}, {}, ArgEnvExitStatus(1, 0),
+            absl::StrCat(link.path(), "\n", script_relative, "\n"));
 }
 
 // argv[0] is added as the script path, even if there was none.
-TEST(ExecDeathTest, InterpreterScriptArgvZeroAdded) {
+TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -243,12 +266,12 @@ TEST(ExecDeathTest, InterpreterScriptArgvZeroAdded) {
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
 
-  CheckOutput(script.path(), {}, {}, ArgEnvExitStatus(1, 0),
-              absl::StrCat(link.path(), "\n", script.path(), "\n"));
+  CheckExec(script.path(), {}, {}, ArgEnvExitStatus(1, 0),
+            absl::StrCat(link.path(), "\n", script.path(), "\n"));
 }
 
 // A NUL byte in the script line ends parsing.
-TEST(ExecDeathTest, InterpreterScriptArgNUL) {
+TEST(ExecTest, InterpreterScriptArgNUL) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -258,12 +281,12 @@ TEST(ExecDeathTest, InterpreterScriptArgNUL) {
       absl::StrCat("#!", link.path(), " foo", std::string(1, '\0'), "bar"),
       0755));
 
-  CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
-              absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n"));
+  CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
+            absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n"));
 }
 
 // Trailing whitespace following interpreter path is ignored.
-TEST(ExecDeathTest, InterpreterScriptTrailingWhitespace) {
+TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -271,12 +294,12 @@ TEST(ExecDeathTest, InterpreterScriptTrailingWhitespace) {
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), "  "), 0755));
 
-  CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(1, 0),
-              absl::StrCat(link.path(), "\n", script.path(), "\n"));
+  CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(1, 0),
+            absl::StrCat(link.path(), "\n", script.path(), "\n"));
 }
 
 // Multiple whitespace characters between interpreter and arg allowed.
-TEST(ExecDeathTest, InterpreterScriptArgWhitespace) {
+TEST(ExecTest, InterpreterScriptArgWhitespace) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
@@ -284,11 +307,11 @@ TEST(ExecDeathTest, InterpreterScriptArgWhitespace) {
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), "  foo"), 0755));
 
-  CheckOutput(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
-              absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n"));
+  CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
+            absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n"));
 }
 
-TEST(ExecDeathTest, InterpreterScriptNoPath) {
+TEST(ExecTest, InterpreterScriptNoPath) {
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "#!", 0755));
 
@@ -299,7 +322,7 @@ TEST(ExecDeathTest, InterpreterScriptNoPath) {
 }
 
 // AT_EXECFN is the path passed to execve.
-TEST(ExecDeathTest, ExecFn) {
+TEST(ExecTest, ExecFn) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
@@ -314,18 +337,18 @@ TEST(ExecDeathTest, ExecFn) {
   auto script_relative =
       ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, script.path()));
 
-  CheckOutput(script_relative, {script_relative}, {}, ArgEnvExitStatus(0, 0),
-              absl::StrCat(script_relative, "\n"));
+  CheckExec(script_relative, {script_relative}, {}, ArgEnvExitStatus(0, 0),
+            absl::StrCat(script_relative, "\n"));
 }
 
-TEST(ExecDeathTest, ExecName) {
+TEST(ExecTest, ExecName) {
   std::string path = WorkloadPath(kStateWorkload);
 
-  CheckOutput(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0),
-              absl::StrCat(Basename(path).substr(0, 15), "\n"));
+  CheckExec(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0),
+            absl::StrCat(Basename(path).substr(0, 15), "\n"));
 }
 
-TEST(ExecDeathTest, ExecNameScript) {
+TEST(ExecTest, ExecNameScript) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
@@ -336,21 +359,21 @@ TEST(ExecDeathTest, ExecNameScript) {
 
   std::string script_path = script.path();
 
-  CheckOutput(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0),
-              absl::StrCat(Basename(script_path).substr(0, 15), "\n"));
+  CheckExec(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0),
+            absl::StrCat(Basename(script_path).substr(0, 15), "\n"));
 }
 
 // execve may be called by a multithreaded process.
-TEST(ExecDeathTest, WithSiblingThread) {
-  CheckOutput("/proc/self/exe", {"/proc/self/exe", kExecWithThread}, {},
-              W_EXITCODE(42, 0), "");
+TEST(ExecTest, WithSiblingThread) {
+  CheckExec("/proc/self/exe", {"/proc/self/exe", kExecWithThread}, {},
+            W_EXITCODE(42, 0), "");
 }
 
 // execve may be called from a thread other than the leader of a multithreaded
 // process.
-TEST(ExecDeathTest, FromSiblingThread) {
-  CheckOutput("/proc/self/exe", {"/proc/self/exe", kExecFromThread}, {},
-              W_EXITCODE(42, 0), "");
+TEST(ExecTest, FromSiblingThread) {
+  CheckExec("/proc/self/exe", {"/proc/self/exe", kExecFromThread}, {},
+            W_EXITCODE(42, 0), "");
 }
 
 TEST(ExecTest, NotFound) {
@@ -376,7 +399,7 @@ void SignalHandler(int signo) {
 
 // Signal handlers are reset on execve(2), unless they have default or ignored
 // disposition.
-TEST(ExecStateDeathTest, HandlerReset) {
+TEST(ExecStateTest, HandlerReset) {
   struct sigaction sa;
   sa.sa_handler = SignalHandler;
   ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
@@ -388,11 +411,11 @@ TEST(ExecStateDeathTest, HandlerReset) {
       absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_DFL))),
   };
 
-  CheckOutput(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // Ignored signal dispositions are not reset.
-TEST(ExecStateDeathTest, IgnorePreserved) {
+TEST(ExecStateTest, IgnorePreserved) {
   struct sigaction sa;
   sa.sa_handler = SIG_IGN;
   ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
@@ -404,11 +427,11 @@ TEST(ExecStateDeathTest, IgnorePreserved) {
       absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_IGN))),
   };
 
-  CheckOutput(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // Signal masks are not reset on exec
-TEST(ExecStateDeathTest, SignalMask) {
+TEST(ExecStateTest, SignalMask) {
   sigset_t s;
   sigemptyset(&s);
   sigaddset(&s, SIGUSR1);
@@ -420,12 +443,12 @@ TEST(ExecStateDeathTest, SignalMask) {
       absl::StrCat(SIGUSR1),
   };
 
-  CheckOutput(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // itimers persist across execve.
 // N.B. Timers created with timer_create(2) should not be preserved!
-TEST(ExecStateDeathTest, ItimerPreserved) {
+TEST(ExecStateTest, ItimerPreserved) {
   // The fork in ForkAndExec clears itimers, so only set them up after fork.
   auto setup_itimer = [] {
     // Ignore SIGALRM, as we don't actually care about timer
@@ -472,10 +495,10 @@ TEST(ExecStateDeathTest, ItimerPreserved) {
 TEST(ProcSelfExe, ChangesAcrossExecve) {
   // See exec_proc_exe_workload for more details. We simply
   // assert that the /proc/self/exe link changes across execve.
-  CheckOutput(WorkloadPath(kProcExeWorkload),
-              {WorkloadPath(kProcExeWorkload),
-               ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))},
-              {}, W_EXITCODE(0, 0), "");
+  CheckExec(WorkloadPath(kProcExeWorkload),
+            {WorkloadPath(kProcExeWorkload),
+             ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))},
+            {}, W_EXITCODE(0, 0), "");
 }
 
 TEST(ExecTest, CloexecNormalFile) {
@@ -484,20 +507,20 @@ TEST(ExecTest, CloexecNormalFile) {
   const FileDescriptor fd_closed_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
 
-  CheckOutput(WorkloadPath(kAssertClosedWorkload),
-              {WorkloadPath(kAssertClosedWorkload),
-               absl::StrCat(fd_closed_on_exec.get())},
-              {}, W_EXITCODE(0, 0), "");
+  CheckExec(WorkloadPath(kAssertClosedWorkload),
+            {WorkloadPath(kAssertClosedWorkload),
+             absl::StrCat(fd_closed_on_exec.get())},
+            {}, W_EXITCODE(0, 0), "");
 
   // The assert closed workload exits with code 2 if the file still exists.  We
   // can use this to do a negative test.
   const FileDescriptor fd_open_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY));
 
-  CheckOutput(WorkloadPath(kAssertClosedWorkload),
-              {WorkloadPath(kAssertClosedWorkload),
-               absl::StrCat(fd_open_on_exec.get())},
-              {}, W_EXITCODE(2, 0), "");
+  CheckExec(WorkloadPath(kAssertClosedWorkload),
+            {WorkloadPath(kAssertClosedWorkload),
+             absl::StrCat(fd_open_on_exec.get())},
+            {}, W_EXITCODE(2, 0), "");
 }
 
 TEST(ExecTest, CloexecEventfd) {
@@ -505,9 +528,40 @@ TEST(ExecTest, CloexecEventfd) {
   ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds());
   FileDescriptor fd(efd);
 
-  CheckOutput(WorkloadPath(kAssertClosedWorkload),
-              {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
-              W_EXITCODE(0, 0), "");
+  CheckExec(WorkloadPath(kAssertClosedWorkload),
+            {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
+            W_EXITCODE(0, 0), "");
+}
+
+TEST(ExecveatTest, BasicWithFDCWD) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
+                absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, Basic) {
+  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string parent_dir = std::string(Dirname(absolute_path));
+  std::string relative_path = std::string(Basename(absolute_path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, /*flags=*/0,
+                ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
+}
+
+TEST(ExecveatTest, AbsolutePathWithFDCWD) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
+                absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, AbsolutePath) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  // File descriptor should be ignored when an absolute path is given.
+  const int32_t badFD = -1;
+  CheckExecveat(badFD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
+                absl::StrCat(path, "\n"));
 }
 
 // Priority consistent across calls to execve()
@@ -522,9 +576,8 @@ TEST(GetpriorityTest, ExecveMaintainsPriority) {
 
   // Program run (priority_execve) will exit(X) where
   // X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio.
-  CheckOutput(WorkloadPath(kPriorityWorkload),
-              {WorkloadPath(kPriorityWorkload)}, {},
-              W_EXITCODE(expected_exit_code, 0), "");
+  CheckExec(WorkloadPath(kPriorityWorkload), {WorkloadPath(kPriorityWorkload)},
+            {}, W_EXITCODE(expected_exit_code, 0), "");
 }
 
 void ExecWithThread() {
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 37b4e6575..fcf64ee59 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -61,6 +61,9 @@ namespace testing {
 
 namespace {
 
+using ::testing::AnyOf;
+using ::testing::Eq;
+
 constexpr char kMessage[] = "soweoneul malhaebwa";
 constexpr in_port_t kPort = 0x409c;  // htons(40000)
 
@@ -83,17 +86,14 @@ void SendUDPMessage(int sock) {
 
 // Send an IP packet and make sure ETH_P_<something else> doesn't pick it up.
 TEST(BasicCookedPacketTest, WrongType) {
-  // (b/129292371): Remove once we support packet sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
     ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP),
                 SyscallFailsWithErrno(EPERM));
     GTEST_SKIP();
   }
 
-  FileDescriptor sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP));
+  FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_PUP)));
 
   // Let's use a simple IP payload: a UDP datagram.
   FileDescriptor udp_sock =
@@ -124,9 +124,6 @@ class CookedPacketTest : public ::testing::TestWithParam<int> {
 };
 
 void CookedPacketTest::SetUp() {
-  // (b/129292371): Remove once we support packet sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
     ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
                 SyscallFailsWithErrno(EPERM));
@@ -138,9 +135,6 @@ void CookedPacketTest::SetUp() {
 }
 
 void CookedPacketTest::TearDown() {
-  // (b/129292371): Remove once we support packet sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   // TearDown will be run even if we skip the test.
   if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
     EXPECT_THAT(close(socket_), SyscallSucceeds());
@@ -177,13 +171,16 @@ TEST_P(CookedPacketTest, Receive) {
   ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
-  ASSERT_EQ(src_len, sizeof(src));
+  // sockaddr_ll ends with an 8 byte physical address field, but ethernet
+  // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
+  // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
+  // sizeof(sockaddr_ll).
+  ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2)));
 
+  // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_protocol, htons(ETH_P_IP));
   EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
-  EXPECT_EQ(src.sll_hatype, ARPHRD_LOOPBACK);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -213,6 +210,9 @@ TEST_P(CookedPacketTest, Receive) {
 
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
+  // TODO(b/129292371): Remove once we support packet socket writing.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Let's send a UDP packet and receive it using a regular UDP socket.
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc
index 6491453b6..d258d353c 100644
--- a/test/syscalls/linux/packet_socket_raw.cc
+++ b/test/syscalls/linux/packet_socket_raw.cc
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "absl/base/internal/endian.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -61,6 +62,9 @@ namespace testing {
 
 namespace {
 
+using ::testing::AnyOf;
+using ::testing::Eq;
+
 constexpr char kMessage[] = "soweoneul malhaebwa";
 constexpr in_port_t kPort = 0x409c;  // htons(40000)
 
@@ -97,9 +101,6 @@ class RawPacketTest : public ::testing::TestWithParam<int> {
 };
 
 void RawPacketTest::SetUp() {
-  // (b/129292371): Remove once we support packet sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
     ASSERT_THAT(socket(AF_PACKET, SOCK_RAW, htons(GetParam())),
                 SyscallFailsWithErrno(EPERM));
@@ -125,9 +126,6 @@ void RawPacketTest::SetUp() {
 }
 
 void RawPacketTest::TearDown() {
-  // (b/129292371): Remove once we support packet sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   // TearDown will be run even if we skip the test.
   if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
     EXPECT_THAT(close(socket_), SyscallSucceeds());
@@ -164,16 +162,16 @@ TEST_P(RawPacketTest, Receive) {
   ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
-  // sizeof(src) is the size of a struct sockaddr_ll. sockaddr_ll ends with an 8
-  // byte physical address field, but ethernet (MAC) addresses only use 6 bytes.
-  // Thus src_len should get modified to be 2 less than the size of sockaddr_ll.
-  ASSERT_EQ(src_len, sizeof(src) - 2);
+  // sockaddr_ll ends with an 8 byte physical address field, but ethernet
+  // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
+  // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
+  // sizeof(sockaddr_ll).
+  ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2)));
 
+  // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_protocol, htons(ETH_P_IP));
   EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
-  EXPECT_EQ(src.sll_hatype, ARPHRD_LOOPBACK);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -214,6 +212,9 @@ TEST_P(RawPacketTest, Receive) {
 
 // Send via a packet socket.
 TEST_P(RawPacketTest, Send) {
+  // TODO(b/129292371): Remove once we support packet socket writing.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Let's send a UDP packet and receive it using a regular UDP socket.
   FileDescriptor udp_sock =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
@@ -309,7 +310,7 @@ TEST_P(RawPacketTest, Send) {
 }
 
 INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest,
-                         ::testing::Values(ETH_P_IP /*, ETH_P_ALL*/));
+                         ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
 }  // namespace
 
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index d0ef8d380..dcfd5f86c 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -16,16 +16,16 @@
 #include <errno.h>
 #include <netinet/in.h>
 #include <poll.h>
-#include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/syscall.h>
+#include <sys/types.h>
 
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "absl/strings/str_split.h"
 #include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "gtest/gtest.h"
-#include "test/util/capability_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/test_util.h"
@@ -105,25 +105,29 @@ PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
       EINVAL, absl::StrCat("failed to find ", type, "/", item, " in:", snmp));
 }
 
-TEST(ProcNetSnmp, TcpReset) {
+TEST(ProcNetSnmp, TcpReset_NoRandomSave) {
   // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
-  const DisableSave ds;
+  DisableSave ds;
 
   uint64_t oldAttemptFails;
   uint64_t oldActiveOpens;
   uint64_t oldOutRsts;
   auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
-  oldOutRsts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
-  oldAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  oldOutRsts =
+      ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+  oldAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
 
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
 
   struct sockaddr_in sin = {
-    .sin_family = AF_INET,
-    .sin_port = htons(1234),
+      .sin_family = AF_INET,
+      .sin_port = htons(1234),
   };
-  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+  ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
   ASSERT_THAT(connect(s.get(), (struct sockaddr *)&sin, sizeof(sin)),
               SyscallFailsWithErrno(ECONNREFUSED));
 
@@ -131,41 +135,54 @@ TEST(ProcNetSnmp, TcpReset) {
   uint64_t newActiveOpens;
   uint64_t newOutRsts;
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
-  newOutRsts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
-  newAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  newOutRsts =
+      ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+  newAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
 
   EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
   EXPECT_EQ(oldOutRsts, newOutRsts - 1);
   EXPECT_EQ(oldAttemptFails, newAttemptFails - 1);
 }
 
-TEST(ProcNetSnmp, TcpEstab) {
+TEST(ProcNetSnmp, TcpEstab_NoRandomSave) {
   // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
-  const DisableSave ds;
+  DisableSave ds;
 
   uint64_t oldEstabResets;
   uint64_t oldActiveOpens;
   uint64_t oldPassiveOpens;
   uint64_t oldCurrEstab;
   auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
-  oldPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
-  oldCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
-  oldEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  oldPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+  oldCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  oldEstabResets = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
 
   FileDescriptor s_listen =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
-
   struct sockaddr_in sin = {
-    .sin_family = AF_INET,
-    .sin_port = htons(1234),
+      .sin_family = AF_INET,
+      .sin_port = 0,
   };
-  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+  ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
   ASSERT_THAT(bind(s_listen.get(), (struct sockaddr *)&sin, sizeof(sin)),
               SyscallSucceeds());
   ASSERT_THAT(listen(s_listen.get(), 1), SyscallSucceeds());
 
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = sizeof(sin);
+  ASSERT_THAT(
+      getsockname(s_listen.get(), reinterpret_cast<sockaddr *>(&sin), &addrlen),
+      SyscallSucceeds());
+
   FileDescriptor s_connect =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
   ASSERT_THAT(connect(s_connect.get(), (struct sockaddr *)&sin, sizeof(sin)),
@@ -179,9 +196,12 @@ TEST(ProcNetSnmp, TcpEstab) {
   uint64_t newPassiveOpens;
   uint64_t newCurrEstab;
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
-  newPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
-  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  newPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
 
   EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
   EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1);
@@ -210,42 +230,47 @@ TEST(ProcNetSnmp, TcpEstab) {
   s_connect.reset(-1);
 
   // Wait until the process of the netstack.
-  absl::SleepFor(absl::Seconds(1.0));
+  absl::SleepFor(absl::Seconds(1));
 
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
-  newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
 
   EXPECT_EQ(oldCurrEstab, newCurrEstab);
   EXPECT_EQ(oldEstabResets, newEstabResets - 2);
 }
 
-TEST(ProcNetSnmp, UdpNoPorts) {
+TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
   // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
-  const DisableSave ds;
+  DisableSave ds;
 
   uint64_t oldOutDatagrams;
   uint64_t oldNoPorts;
   auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
-  oldNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  oldNoPorts =
+      ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
 
-  FileDescriptor s =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
 
   struct sockaddr_in sin = {
-    .sin_family = AF_INET,
-    .sin_port = htons(1234),
+      .sin_family = AF_INET,
+      .sin_port = htons(4444),
   };
-  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
   ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
               SyscallSucceedsWithValue(1));
 
   uint64_t newOutDatagrams;
   uint64_t newNoPorts;
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
-  newNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  newNoPorts =
+      ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
 
   EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
   EXPECT_EQ(oldNoPorts, newNoPorts - 1);
@@ -258,24 +283,32 @@ TEST(ProcNetSnmp, UdpIn) {
   uint64_t oldOutDatagrams;
   uint64_t oldInDatagrams;
   auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
-  oldInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  oldInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
 
+  std::cerr << "snmp: " << std::endl << snmp << std::endl;
   FileDescriptor server =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-
   struct sockaddr_in sin = {
-    .sin_family = AF_INET,
-    .sin_port = htons(1234),
+      .sin_family = AF_INET,
+      .sin_port = htons(0),
   };
-  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
   ASSERT_THAT(bind(server.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceeds());
+  // Get the port bound by the server socket.
+  socklen_t addrlen = sizeof(sin);
+  ASSERT_THAT(
+      getsockname(server.get(), reinterpret_cast<sockaddr *>(&sin), &addrlen),
       SyscallSucceeds());
 
   FileDescriptor client =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  ASSERT_THAT(sendto(client.get(), "a", 1, 0, (struct sockaddr *)&sin,
-                     sizeof(sin)), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(
+      sendto(client.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+      SyscallSucceedsWithValue(1));
 
   char buf[128];
   ASSERT_THAT(recvfrom(server.get(), buf, sizeof(buf), 0, NULL, NULL),
@@ -284,8 +317,11 @@ TEST(ProcNetSnmp, UdpIn) {
   uint64_t newOutDatagrams;
   uint64_t newInDatagrams;
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
-  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
-  newInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+  std::cerr << "new snmp: " << std::endl << snmp << std::endl;
+  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  newInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+      GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
 
   EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
   EXPECT_EQ(oldInDatagrams, newInDatagrams - 1);
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 1c56540bc..3331288b7 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -185,7 +185,7 @@ TEST_P(SendFileTest, Shutdown) {
   // Create a socket.
   std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
   const FileDescriptor client(std::get<0>(fds));
-  FileDescriptor server(std::get<1>(fds));  // non-const, released below.
+  FileDescriptor server(std::get<1>(fds));  // non-const, reset below.
 
   // If this is a TCP socket, then turn off linger.
   if (GetParam() == AF_INET) {
@@ -210,14 +210,14 @@ TEST_P(SendFileTest, Shutdown) {
   // checking the contents (other tests do that), so we just re-use the same
   // buffer as above.
   ScopedThread t([&]() {
-    int done = 0;
+    size_t done = 0;
     while (done < data.size()) {
-      int n = read(server.get(), data.data(), data.size());
+      int n = RetryEINTR(read)(server.get(), data.data(), data.size());
       ASSERT_THAT(n, SyscallSucceeds());
       done += n;
     }
     // Close the server side socket.
-    ASSERT_THAT(close(server.release()), SyscallSucceeds());
+    server.reset();
   });
 
   // Continuously stream from the file to the socket. Note we do not assert
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index dafe82494..b5c82cd67 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -231,11 +231,21 @@ TEST_P(UnixNonStreamSocketPairTest, SendTimeout) {
       setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
       SyscallSucceeds());
 
-  char buf[100] = {};
+  const int buf_size = 5 * kPageSize;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &buf_size,
+                         sizeof(buf_size)),
+              SyscallSucceeds());
+  EXPECT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_RCVBUF, &buf_size,
+                         sizeof(buf_size)),
+              SyscallSucceeds());
+
+  // The buffer size should be big enough to avoid many iterations in the next
+  // loop. Otherwise, this will slow down cooperative_save tests.
+  std::vector<char> buf(kPageSize);
   for (;;) {
     int ret;
     ASSERT_THAT(
-        ret = RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0),
+        ret = RetryEINTR(send)(sockets->first_fd(), buf.data(), buf.size(), 0),
         ::testing::AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(EAGAIN)));
     if (ret == -1) {
       break;
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
index 659c93945..8f38ed92f 100644
--- a/test/syscalls/linux/socket_unix_stream.cc
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <poll.h>
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -44,6 +46,50 @@ TEST_P(StreamUnixSocketPairTest, ReadOneSideClosed) {
               SyscallSucceedsWithValue(0));
 }
 
+TEST_P(StreamUnixSocketPairTest, RecvmsgOneSideClosed) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Set timeout so that it will not wait forever.
+  struct timeval tv {
+    .tv_sec = 0, .tv_usec = 10
+  };
+  EXPECT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv,
+                         sizeof(tv)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+
+  char received_data[10] = {};
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(recvmsg(sockets->second_fd(), &msg, MSG_WAITALL),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(StreamUnixSocketPairTest, ReadOneSideClosedWithUnreadData) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char buf[10] = {};
+  ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds());
+
+  ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+
+  ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallFailsWithErrno(ECONNRESET));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, StreamUnixSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index c1e9ce22c..856398994 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/runsc/specutils"
 	"gvisor.dev/gvisor/runsc/testutil"
 	"gvisor.dev/gvisor/test/syscalls/gtest"
+	"gvisor.dev/gvisor/test/uds"
 )
 
 // Location of syscall tests, relative to the repo root.
@@ -50,6 +51,8 @@ var (
 	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
 	parallel   = flag.Bool("parallel", false, "run tests in parallel")
 	runscPath  = flag.String("runsc", "", "path to runsc binary")
+
+	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
 )
 
 // runTestCaseNative runs the test case directly on the host machine.
@@ -86,6 +89,19 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
 	// intepret them.
 	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
 
+	if *addUDSTree {
+		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+		if err != nil {
+			t.Fatalf("failed to create socket tree: %v", err)
+		}
+		defer cleanup()
+
+		env = append(env, "TEST_UDS_TREE="+socketDir)
+		// On Linux, the concept of "attach" location doesn't exist.
+		// Just pass the same path to make these tests identical.
+		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
+	}
+
 	cmd := exec.Command(testBin, gtest.FilterTestFlag+"="+tc.FullName())
 	cmd.Env = env
 	cmd.Stdout = os.Stdout
@@ -96,101 +112,39 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
 	}
 }
 
-// runsTestCaseRunsc runs the test case in runsc.
-func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+// runRunsc runs spec in runsc in a standard test configuration.
+//
+// runsc logs will be saved to a path in TEST_UNDECLARED_OUTPUTS_DIR.
+//
+// Returns an error if the sandboxed application exits non-zero.
+func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
+	bundleDir, err := testutil.SetupBundleDir(spec)
 	if err != nil {
-		t.Fatalf("SetupRootDir failed: %v", err)
+		return fmt.Errorf("SetupBundleDir failed: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-
-	// Run a new container with the test executable and filter for the
-	// given test suite and name.
-	spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-
-	// Mark the root as writeable, as some tests attempt to
-	// write to the rootfs, and expect EACCES, not EROFS.
-	spec.Root.Readonly = false
-
-	// Test spec comes with pre-defined mounts that we don't want. Reset it.
-	spec.Mounts = nil
-	if *useTmpfs {
-		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
-		// features only available in gVisor's internal tmpfs may fail.
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: "/tmp",
-			Type:        "tmpfs",
-		})
-	} else {
-		// Use a gofer-backed directory as '/tmp'.
-		//
-		// Tests might be running in parallel, so make sure each has a
-		// unique test temp dir.
-		//
-		// Some tests (e.g., sticky) access this mount from other
-		// users, so make sure it is world-accessible.
-		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-		if err != nil {
-			t.Fatalf("could not create temp dir: %v", err)
-		}
-		defer os.RemoveAll(tmpDir)
-
-		if err := os.Chmod(tmpDir, 0777); err != nil {
-			t.Fatalf("could not chmod temp dir: %v", err)
-		}
-
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
-	}
-
-	// Set environment variable that indicates we are
-	// running in gVisor and with the given platform.
-	platformVar := "TEST_ON_GVISOR"
-	env := append(os.Environ(), platformVar+"="+*platform)
-
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
-	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
-
-	spec.Process.Env = env
+	defer os.RemoveAll(bundleDir)
 
-	bundleDir, err := testutil.SetupBundleDir(spec)
+	rootDir, err := testutil.SetupRootDir()
 	if err != nil {
-		t.Fatalf("SetupBundleDir failed: %v", err)
+		return fmt.Errorf("SetupRootDir failed: %v", err)
 	}
-	defer os.RemoveAll(bundleDir)
+	defer os.RemoveAll(rootDir)
 
+	name := tc.FullName()
 	id := testutil.UniqueContainerID()
-	log.Infof("Running test %q in container %q", tc.FullName(), id)
+	log.Infof("Running test %q in container %q", name, id)
 	specutils.LogSpec(spec)
 
 	args := []string{
-		"-platform", *platform,
 		"-root", rootDir,
-		"-file-access", *fileAccess,
 		"-network=none",
 		"-log-format=text",
 		"-TESTONLY-unsafe-nonroot=true",
 		"-net-raw=true",
 		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
 		"-watchdog-action=panic",
+		"-platform", *platform,
+		"-file-access", *fileAccess,
 	}
 	if *overlay {
 		args = append(args, "-overlay")
@@ -201,14 +155,18 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 	if *strace {
 		args = append(args, "-strace")
 	}
+	if *addUDSTree {
+		args = append(args, "-fsgofer-host-uds")
+	}
+
 	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
-		tdir := filepath.Join(outDir, strings.Replace(tc.FullName(), "/", "_", -1))
+		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
 		if err := os.MkdirAll(tdir, 0755); err != nil {
-			t.Fatalf("could not create test dir: %v", err)
+			return fmt.Errorf("could not create test dir: %v", err)
 		}
 		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
 		if err != nil {
-			t.Fatalf("could not create temp dir: %v", err)
+			return fmt.Errorf("could not create temp dir: %v", err)
 		}
 		debugLogDir += "/"
 		log.Infof("runsc logs: %s", debugLogDir)
@@ -248,7 +206,7 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 		if !ok {
 			return
 		}
-		t.Errorf("%s: Got signal: %v", tc.FullName(), s)
+		log.Warningf("%s: Got signal: %v", name, s)
 		done := make(chan bool)
 		go func() {
 			dArgs := append(args, "-alsologtostderr=true", "debug", "--stacks", id)
@@ -259,14 +217,14 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 			done <- true
 		}()
 
-		timeout := time.Tick(3 * time.Second)
+		timeout := time.After(3 * time.Second)
 		select {
 		case <-timeout:
-			t.Logf("runsc debug --stacks is timeouted")
+			log.Infof("runsc debug --stacks is timeouted")
 		case <-done:
 		}
 
-		t.Logf("Send SIGTERM to the sandbox process")
+		log.Warningf("Send SIGTERM to the sandbox process")
 		dArgs := append(args, "debug",
 			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
 			id)
@@ -275,11 +233,143 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 		cmd.Stderr = os.Stderr
 		cmd.Run()
 	}()
-	if err = cmd.Run(); err != nil {
-		t.Errorf("test %q exited with status %v, want 0", tc.FullName(), err)
-	}
+
+	err = cmd.Run()
+
 	signal.Stop(sig)
 	close(sig)
+
+	return err
+}
+
+// setupUDSTree updates the spec to expose a UDS tree for gofer socket testing.
+func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
+	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create socket tree: %v", err)
+	}
+
+	// Standard access to entire tree.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets",
+		Source:      socketDir,
+		Type:        "bind",
+	})
+
+	// Individual attach points for each socket to test mounts that attach
+	// directly to the sockets.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/stream/echo",
+		Source:      filepath.Join(socketDir, "stream/echo"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/stream/nonlistening",
+		Source:      filepath.Join(socketDir, "stream/nonlistening"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/seqpacket/echo",
+		Source:      filepath.Join(socketDir, "seqpacket/echo"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/seqpacket/nonlistening",
+		Source:      filepath.Join(socketDir, "seqpacket/nonlistening"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/dgram/null",
+		Source:      filepath.Join(socketDir, "dgram/null"),
+		Type:        "bind",
+	})
+
+	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_TREE=/tmp/sockets")
+	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_ATTACH_TREE=/tmp/sockets-attach")
+
+	return cleanup, nil
+}
+
+// runTestCaseRunsc runs the test case in runsc.
+func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
+	// Run a new container with the test executable and filter for the
+	// given test suite and name.
+	spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName())
+
+	// Mark the root as writeable, as some tests attempt to
+	// write to the rootfs, and expect EACCES, not EROFS.
+	spec.Root.Readonly = false
+
+	// Test spec comes with pre-defined mounts that we don't want. Reset it.
+	spec.Mounts = nil
+	if *useTmpfs {
+		// Forces '/tmp' to be mounted as tmpfs, otherwise tests that rely on
+		// features only available in gVisor's internal tmpfs may fail.
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Destination: "/tmp",
+			Type:        "tmpfs",
+		})
+	} else {
+		// Use a gofer-backed directory as '/tmp'.
+		//
+		// Tests might be running in parallel, so make sure each has a
+		// unique test temp dir.
+		//
+		// Some tests (e.g., sticky) access this mount from other
+		// users, so make sure it is world-accessible.
+		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+		if err != nil {
+			t.Fatalf("could not create temp dir: %v", err)
+		}
+		defer os.RemoveAll(tmpDir)
+
+		if err := os.Chmod(tmpDir, 0777); err != nil {
+			t.Fatalf("could not chmod temp dir: %v", err)
+		}
+
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Type:        "bind",
+			Destination: "/tmp",
+			Source:      tmpDir,
+		})
+	}
+
+	// Set environment variable that indicates we are
+	// running in gVisor and with the given platform.
+	platformVar := "TEST_ON_GVISOR"
+	env := append(os.Environ(), platformVar+"="+*platform)
+
+	// Remove env variables that cause the gunit binary to write output
+	// files, since they will stomp on each other, and on the output files
+	// from this go test.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard env variables so that the gunit binary does not try to
+	// interpret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
+	// be backed by tmpfs.
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = "TEST_TMPDIR=/tmp"
+			break
+		}
+	}
+
+	spec.Process.Env = env
+
+	if *addUDSTree {
+		cleanup, err := setupUDSTree(spec)
+		if err != nil {
+			t.Fatalf("error creating UDS tree: %v", err)
+		}
+		defer cleanup()
+	}
+
+	if err := runRunsc(tc, spec); err != nil {
+		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
+	}
 }
 
 // filterEnv returns an environment with the blacklisted variables removed.
diff --git a/test/uds/BUILD b/test/uds/BUILD
new file mode 100644
index 000000000..a3843e699
--- /dev/null
+++ b/test/uds/BUILD
@@ -0,0 +1,17 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "uds",
+    testonly = 1,
+    srcs = ["uds.go"],
+    importpath = "gvisor.dev/gvisor/test/uds",
+    deps = [
+        "//pkg/log",
+        "//pkg/unet",
+    ],
+)
diff --git a/test/uds/uds.go b/test/uds/uds.go
new file mode 100644
index 000000000..b714c61b0
--- /dev/null
+++ b/test/uds/uds.go
@@ -0,0 +1,228 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package uds contains helpers for testing external UDS functionality.
+package uds
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/unet"
+)
+
+// createEchoSocket creates a socket that echoes back anything received.
+//
+// Only works for stream, seqpacket sockets.
+func createEchoSocket(path string, protocol int) (cleanup func(), err error) {
+	fd, err := syscall.Socket(syscall.AF_UNIX, protocol, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating echo(%d) socket: %v", protocol, err)
+	}
+
+	if err := syscall.Bind(fd, &syscall.SockaddrUnix{Name: path}); err != nil {
+		return nil, fmt.Errorf("error binding echo(%d) socket: %v", protocol, err)
+	}
+
+	if err := syscall.Listen(fd, 0); err != nil {
+		return nil, fmt.Errorf("error listening echo(%d) socket: %v", protocol, err)
+	}
+
+	server, err := unet.NewServerSocket(fd)
+	if err != nil {
+		return nil, fmt.Errorf("error creating echo(%d) unet socket: %v", protocol, err)
+	}
+
+	acceptAndEchoOne := func() error {
+		s, err := server.Accept()
+		if err != nil {
+			return fmt.Errorf("failed to accept: %v", err)
+		}
+		defer s.Close()
+
+		for {
+			buf := make([]byte, 512)
+			for {
+				n, err := s.Read(buf)
+				if err == io.EOF {
+					return nil
+				}
+				if err != nil {
+					return fmt.Errorf("failed to read: %d, %v", n, err)
+				}
+
+				n, err = s.Write(buf[:n])
+				if err != nil {
+					return fmt.Errorf("failed to write: %d, %v", n, err)
+				}
+			}
+		}
+	}
+
+	go func() {
+		for {
+			if err := acceptAndEchoOne(); err != nil {
+				log.Warningf("Failed to handle echo(%d) socket: %v", protocol, err)
+				return
+			}
+		}
+	}()
+
+	cleanup = func() {
+		if err := server.Close(); err != nil {
+			log.Warningf("Failed to close echo(%d) socket: %v", protocol, err)
+		}
+	}
+
+	return cleanup, nil
+}
+
+// createNonListeningSocket creates a socket that is bound but not listening.
+//
+// Only relevant for stream, seqpacket sockets.
+func createNonListeningSocket(path string, protocol int) (cleanup func(), err error) {
+	fd, err := syscall.Socket(syscall.AF_UNIX, protocol, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating nonlistening(%d) socket: %v", protocol, err)
+	}
+
+	if err := syscall.Bind(fd, &syscall.SockaddrUnix{Name: path}); err != nil {
+		return nil, fmt.Errorf("error binding nonlistening(%d) socket: %v", protocol, err)
+	}
+
+	cleanup = func() {
+		if err := syscall.Close(fd); err != nil {
+			log.Warningf("Failed to close nonlistening(%d) socket: %v", protocol, err)
+		}
+	}
+
+	return cleanup, nil
+}
+
+// createNullSocket creates a socket that reads and discards anything received.
+//
+// Only works for dgram sockets.
+func createNullSocket(path string, protocol int) (cleanup func(), err error) {
+	fd, err := syscall.Socket(syscall.AF_UNIX, protocol, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating null(%d) socket: %v", protocol, err)
+	}
+
+	if err := syscall.Bind(fd, &syscall.SockaddrUnix{Name: path}); err != nil {
+		return nil, fmt.Errorf("error binding null(%d) socket: %v", protocol, err)
+	}
+
+	s, err := unet.NewSocket(fd)
+	if err != nil {
+		return nil, fmt.Errorf("error creating null(%d) unet socket: %v", protocol, err)
+	}
+
+	go func() {
+		buf := make([]byte, 512)
+		for {
+			n, err := s.Read(buf)
+			if err != nil {
+				log.Warningf("failed to read: %d, %v", n, err)
+				return
+			}
+		}
+	}()
+
+	cleanup = func() {
+		if err := s.Close(); err != nil {
+			log.Warningf("Failed to close null(%d) socket: %v", protocol, err)
+		}
+	}
+
+	return cleanup, nil
+}
+
+type socketCreator func(path string, proto int) (cleanup func(), err error)
+
+// CreateSocketTree creates a local tree of unix domain sockets for use in
+// testing:
+//  * /stream/echo
+//  * /stream/nonlistening
+//  * /seqpacket/echo
+//  * /seqpacket/nonlistening
+//  * /dgram/null
+func CreateSocketTree(baseDir string) (dir string, cleanup func(), err error) {
+	dir, err = ioutil.TempDir(baseDir, "sockets")
+	if err != nil {
+		return "", nil, fmt.Errorf("error creating temp dir: %v", err)
+	}
+
+	var protocols = []struct {
+		protocol int
+		name     string
+		sockets  map[string]socketCreator
+	}{
+		{
+			protocol: syscall.SOCK_STREAM,
+			name:     "stream",
+			sockets: map[string]socketCreator{
+				"echo":         createEchoSocket,
+				"nonlistening": createNonListeningSocket,
+			},
+		},
+		{
+			protocol: syscall.SOCK_SEQPACKET,
+			name:     "seqpacket",
+			sockets: map[string]socketCreator{
+				"echo":         createEchoSocket,
+				"nonlistening": createNonListeningSocket,
+			},
+		},
+		{
+			protocol: syscall.SOCK_DGRAM,
+			name:     "dgram",
+			sockets: map[string]socketCreator{
+				"null": createNullSocket,
+			},
+		},
+	}
+
+	var cleanups []func()
+	for _, proto := range protocols {
+		protoDir := filepath.Join(dir, proto.name)
+		if err := os.Mkdir(protoDir, 0755); err != nil {
+			return "", nil, fmt.Errorf("error creating %s dir: %v", proto.name, err)
+		}
+
+		for name, fn := range proto.sockets {
+			path := filepath.Join(protoDir, name)
+			cleanup, err := fn(path, proto.protocol)
+			if err != nil {
+				return "", nil, fmt.Errorf("error creating %s %s socket: %v", proto.name, name, err)
+			}
+
+			cleanups = append(cleanups, cleanup)
+		}
+	}
+
+	cleanup = func() {
+		for _, c := range cleanups {
+			c()
+		}
+
+		os.RemoveAll(dir)
+	}
+
+	return dir, cleanup, nil
+}
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index f7d231b14..88b1e7911 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -163,6 +163,26 @@ PosixError Chmod(absl::string_view path, int mode) {
   return NoError();
 }
 
+PosixError MknodAt(const FileDescriptor& dfd, absl::string_view path, int mode,
+                   dev_t dev) {
+  // mknodat takes a C string; a string_view is not guaranteed NUL-terminated.
+  const std::string path_str(path);
+  if (mknodat(dfd.get(), path_str.c_str(), mode, dev) < 0) {
+    return PosixError(errno, absl::StrCat("mknod ", path));
+  }
+  return NoError();
+}
+
+PosixError UnlinkAt(const FileDescriptor& dfd, absl::string_view path,
+                    int flags) {
+  // unlinkat takes a C string; a string_view is not guaranteed NUL-terminated.
+  const std::string path_str(path);
+  if (unlinkat(dfd.get(), path_str.c_str(), flags) < 0) {
+    return PosixError(errno, absl::StrCat("unlink ", path));
+  }
+  return NoError();
+}
+
 PosixError Mkdir(absl::string_view path, int mode) {
   int res = mkdir(std::string(path).c_str(), mode);
   if (res < 0) {
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index e5b555891..ee1b341d7 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "absl/strings/string_view.h"
+#include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 
 namespace gvisor {
@@ -44,6 +45,14 @@ PosixError Delete(absl::string_view path);
 // Changes the mode of a file or returns an error.
 PosixError Chmod(absl::string_view path, int mode);
 
+// Creates a special or ordinary file via mknodat(2) or returns an error.
+PosixError MknodAt(const FileDescriptor& dfd, absl::string_view path, int mode,
+                   dev_t dev);
+
+// Unlinks path via unlinkat(2) with the given flags or returns an error.
+PosixError UnlinkAt(const FileDescriptor& dfd, absl::string_view path,
+                    int flags);
+
 // Truncates a file to the given length or returns an error.
 PosixError Truncate(absl::string_view path, int length);
 
diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc
index 95f5f3b4f..8b676751b 100644
--- a/test/util/multiprocess_util.cc
+++ b/test/util/multiprocess_util.cc
@@ -14,6 +14,7 @@
 
 #include "test/util/multiprocess_util.h"
 
+#include <asm/unistd.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
@@ -30,11 +31,12 @@
 namespace gvisor {
 namespace testing {
 
-PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
-                                  const ExecveArray& argv,
-                                  const ExecveArray& envv,
-                                  const std::function<void()>& fn, pid_t* child,
-                                  int* execve_errno) {
+namespace {
+
+// exec_fn wraps a variant of the exec family, e.g. execve or execveat.
+PosixErrorOr<Cleanup> ForkAndExecHelper(const std::function<void()>& exec_fn,
+                                        const std::function<void()>& fn,
+                                        pid_t* child, int* execve_errno) {
   int pfds[2];
   int ret = pipe2(pfds, O_CLOEXEC);
   if (ret < 0) {
@@ -76,7 +78,9 @@ PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
       fn();
     }
 
-    execve(filename.c_str(), argv.get(), envv.get());
+    // Call variant of exec function.
+    exec_fn();
+
     int error = errno;
     if (WriteFd(pfds[1], &error, sizeof(error)) != sizeof(error)) {
       // We can't do much if the write fails, but we can at least exit with a
@@ -116,6 +120,36 @@ PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
   return std::move(cleanup);
 }
 
+}  // namespace
+
+PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
+                                  const ExecveArray& argv,
+                                  const ExecveArray& envv,
+                                  const std::function<void()>& fn, pid_t* child,
+                                  int* execve_errno) {
+  // Fetch the raw arrays up front; the closure captures everything by value.
+  char* const* const args = argv.get();
+  char* const* const envs = envv.get();
+  return ForkAndExecHelper(
+      [filename, args, envs] { execve(filename.c_str(), args, envs); }, fn,
+      child, execve_errno);
+}
+
+PosixErrorOr<Cleanup> ForkAndExecveat(const int32_t dirfd,
+                                      const std::string& pathname,
+                                      const ExecveArray& argv,
+                                      const ExecveArray& envv, const int flags,
+                                      const std::function<void()>& fn,
+                                      pid_t* child, int* execve_errno) {
+  // execveat is invoked by syscall number rather than through a libc wrapper.
+  char* const* const args = argv.get();
+  char* const* const envs = envv.get();
+  const auto do_execveat = [dirfd, pathname, args, envs, flags] {
+    syscall(__NR_execveat, dirfd, pathname.c_str(), args, envs, flags);
+  };
+  return ForkAndExecHelper(do_execveat, fn, child, execve_errno);
+}
+
 PosixErrorOr<int> InForkedProcess(const std::function<void()>& fn) {
   pid_t pid = fork();
   if (pid == 0) {
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index 0aecd3439..c413d63ea 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -102,6 +102,13 @@ inline PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
   return ForkAndExec(filename, argv, envv, [] {}, child, execve_errno);
 }
 
+// Like ForkAndExec, but execs pathname via execveat using dirfd and flags.
+PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd, const std::string& pathname,
+                                      const ExecveArray& argv,
+                                      const ExecveArray& envv, int flags,
+                                      const std::function<void()>& fn,
+                                      pid_t* child, int* execve_errno);
+
 // Calls fn in a forked subprocess and returns the exit status of the
 // subprocess.
 //
diff --git a/third_party/gvsync/BUILD b/third_party/gvsync/BUILD
index 8dab51daa..7d6d59c48 100644
--- a/third_party/gvsync/BUILD
+++ b/third_party/gvsync/BUILD
@@ -1,4 +1,5 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -7,8 +8,6 @@ package(
 
 exports_files(["LICENSE"])
 
-load("//tools/go_generics:defs.bzl", "go_template")
-
 go_template(
     name = "generic_atomicptr",
     srcs = ["atomicptr_unsafe.go"],
diff --git a/third_party/gvsync/atomicptrtest/BUILD b/third_party/gvsync/atomicptrtest/BUILD
index 6cf69ea91..447ecf96a 100644
--- a/third_party/gvsync/atomicptrtest/BUILD
+++ b/third_party/gvsync/atomicptrtest/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "atomicptr_int",
     out = "atomicptr_int_unsafe.go",
diff --git a/third_party/gvsync/seqatomictest/BUILD b/third_party/gvsync/seqatomictest/BUILD
index 9e87e0bc5..c858c20c4 100644
--- a/third_party/gvsync/seqatomictest/BUILD
+++ b/third_party/gvsync/seqatomictest/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "seqatomic_int",
     out = "seqatomic_int_unsafe.go",
diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD
index a6f8cdd3c..9d26a88b7 100644
--- a/tools/go_generics/rules_tests/BUILD
+++ b/tools/go_generics/rules_tests/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
 go_template_instance(
     name = "instance",
     out = "instance_test.go",