Diffstat (limited to 'pkg')
657 files changed, 38020 insertions, 14363 deletions
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index b5c5cc20b..4a26e28de 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -38,6 +38,7 @@ go_library( "ipc.go", "limits.go", "linux.go", + "membarrier.go", "mm.go", "netdevice.go", "netfilter.go", @@ -74,9 +75,9 @@ go_library( "//pkg/abi", "//pkg/binary", "//pkg/bits", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/usermem", - "//tools/go_marshal/marshal", - "//tools/go_marshal/primitive", ], ) diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 86ee3f8b5..5fc099892 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -42,6 +42,8 @@ const ( // // The priority field is currently ignored in the implementation below. Also // note that the IOCB_FLAG_RESFD feature is not supported. +// +// +marshal type IOCallback struct { Data uint64 Key uint32 @@ -64,6 +66,7 @@ type IOCallback struct { // IOEvent describes an I/O result. // +// +marshal // +stateify savable type IOEvent struct { Data uint64 diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index aa3d3ce70..9422fcf69 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -16,6 +16,7 @@ package linux // BPFInstruction is a raw BPF virtual machine instruction. // +// +marshal slice:BPFInstructionSlice // +stateify savable type BPFInstruction struct { // OpCode is the operation to execute. diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index 965f74663..afd16cc27 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -177,12 +177,16 @@ const ( ) // CapUserHeader is equivalent to Linux's cap_user_header_t. +// +// +marshal type CapUserHeader struct { Version uint32 Pid int32 } // CapUserData is equivalent to Linux's cap_user_data_t. +// +// +marshal slice:CapUserDataSlice type CapUserData struct { Effective uint32 Permitted uint32 diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 192e2093b..7771650b3 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -54,9 +54,9 @@ const ( // Unix98 PTY masters. UNIX98_PTY_MASTER_MAJOR = 128 - // UNIX98_PTY_SLAVE_MAJOR is the initial major device number for - // Unix98 PTY slaves. - UNIX98_PTY_SLAVE_MAJOR = 136 + // UNIX98_PTY_REPLICA_MAJOR is the initial major device number for + // Unix98 PTY replicas. + UNIX98_PTY_REPLICA_MAJOR = 136 ) // Minor device numbers for TTYAUX_MAJOR. diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index 9242e80a5..cc3571fad 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -45,6 +45,8 @@ const ( ) // Flock is the lock structure for F_SETLK. +// +// +marshal type Flock struct { Type int16 Whence int16 @@ -63,6 +65,8 @@ const ( ) // FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX. +// +// +marshal type FOwnerEx struct { Type int32 PID int32 diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 158d2db5b..0d921ed6f 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -29,6 +29,7 @@ const ( SYSFS_MAGIC = 0x62656572 TMPFS_MAGIC = 0x01021994 V9FS_MAGIC = 0x01021997 + FUSE_SUPER_MAGIC = 0x65735546 ) // Filesystem path limits, from uapi/linux/limits.h. @@ -44,17 +45,18 @@ type Statfs struct { // Type is one of the filesystem magic values, defined above. Type uint64 - // BlockSize is the data block size. + // BlockSize is the optimal transfer block size in bytes. BlockSize int64 - // Blocks is the number of data blocks in use. + // Blocks is the maximum number of data blocks the filesystem may store, in + // units of BlockSize. 
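The +marshal directives added throughout this change tell the go_marshal code generator to emit ABI serialization methods for the annotated types. As a rough usage sketch (illustrative only, not part of this diff; it assumes the generated SizeBytes/MarshalBytes/UnmarshalBytes methods whose hand-written counterparts appear elsewhere in this change):

// Hypothetical round-trip through the generated methods for Flock,
// which is newly annotated with +marshal above.
package main

import "gvisor.dev/gvisor/pkg/abi/linux"

func roundTripFlock(l linux.Flock) linux.Flock {
	buf := make([]byte, l.SizeBytes()) // size of the ABI byte representation
	l.MarshalBytes(buf)                // serialize into buf
	var out linux.Flock
	out.UnmarshalBytes(buf) // deserialize back into a struct
	return out
}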
Blocks uint64 - // BlocksFree is the number of free blocks. + // BlocksFree is the number of free data blocks, in units of BlockSize. BlocksFree uint64 - // BlocksAvailable is the number of blocks free for use by - // unprivileged users. + // BlocksAvailable is the number of data blocks free for use by + // unprivileged users, in units of BlockSize. BlocksAvailable uint64 // Files is the number of used file nodes on the filesystem. diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index 7e30483ee..d91c97a64 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -14,12 +14,20 @@ package linux +import ( + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" +) + // +marshal type FUSEOpcode uint32 // +marshal type FUSEOpID uint64 +// FUSE_ROOT_ID is the ID of the root inode. +const FUSE_ROOT_ID = 1 + // Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. const ( FUSE_LOOKUP FUSEOpcode = 1 @@ -116,61 +124,28 @@ type FUSEHeaderOut struct { Unique FUSEOpID } -// FUSEWriteIn is the header written by a daemon when it makes a -// write request to the FUSE filesystem. -// -// +marshal -type FUSEWriteIn struct { - // Fh specifies the file handle that is being written to. - Fh uint64 - - // Offset is the offset of the write. - Offset uint64 - - // Size is the size of data being written. - Size uint32 - - // WriteFlags is the flags used during the write. - WriteFlags uint32 - - // LockOwner is the ID of the lock owner. - LockOwner uint64 - - // Flags is the flags for the request. - Flags uint32 - - _ uint32 -} - // FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. +// Our target version is 7.23, but a few flags from later versions are implemented in advance. const ( - FUSE_ASYNC_READ = 1 << 0 - FUSE_POSIX_LOCKS = 1 << 1 - FUSE_FILE_OPS = 1 << 2 - FUSE_ATOMIC_O_TRUNC = 1 << 3 - FUSE_EXPORT_SUPPORT = 1 << 4 - FUSE_BIG_WRITES = 1 << 5 - FUSE_DONT_MASK = 1 << 6 - FUSE_SPLICE_WRITE = 1 << 7 - FUSE_SPLICE_MOVE = 1 << 8 - FUSE_SPLICE_READ = 1 << 9 - FUSE_FLOCK_LOCKS = 1 << 10 - FUSE_HAS_IOCTL_DIR = 1 << 11 - FUSE_AUTO_INVAL_DATA = 1 << 12 - FUSE_DO_READDIRPLUS = 1 << 13 - FUSE_READDIRPLUS_AUTO = 1 << 14 - FUSE_ASYNC_DIO = 1 << 15 - FUSE_WRITEBACK_CACHE = 1 << 16 - FUSE_NO_OPEN_SUPPORT = 1 << 17 - FUSE_PARALLEL_DIROPS = 1 << 18 - FUSE_HANDLE_KILLPRIV = 1 << 19 - FUSE_POSIX_ACL = 1 << 20 - FUSE_ABORT_ERROR = 1 << 21 - FUSE_MAX_PAGES = 1 << 22 - FUSE_CACHE_SYMLINKS = 1 << 23 - FUSE_NO_OPENDIR_SUPPORT = 1 << 24 - FUSE_EXPLICIT_INVAL_DATA = 1 << 25 - FUSE_MAP_ALIGNMENT = 1 << 26 + FUSE_ASYNC_READ = 1 << 0 + FUSE_POSIX_LOCKS = 1 << 1 + FUSE_FILE_OPS = 1 << 2 + FUSE_ATOMIC_O_TRUNC = 1 << 3 + FUSE_EXPORT_SUPPORT = 1 << 4 + FUSE_BIG_WRITES = 1 << 5 + FUSE_DONT_MASK = 1 << 6 + FUSE_SPLICE_WRITE = 1 << 7 + FUSE_SPLICE_MOVE = 1 << 8 + FUSE_SPLICE_READ = 1 << 9 + FUSE_FLOCK_LOCKS = 1 << 10 + FUSE_HAS_IOCTL_DIR = 1 << 11 + FUSE_AUTO_INVAL_DATA = 1 << 12 + FUSE_DO_READDIRPLUS = 1 << 13 + FUSE_READDIRPLUS_AUTO = 1 << 14 + FUSE_ASYNC_DIO = 1 << 15 + FUSE_WRITEBACK_CACHE = 1 << 16 + FUSE_NO_OPEN_SUPPORT = 1 << 17 + FUSE_MAX_PAGES = 1 << 22 // From FUSE 7.28 ) // currently supported FUSE protocol version numbers. @@ -179,6 +154,13 @@ const ( FUSE_KERNEL_MINOR_VERSION = 31 ) +// Constants relevant to FUSE operations. +const ( + FUSE_NAME_MAX = 1024 + FUSE_PAGE_SIZE = 4096 + FUSE_DIRENT_ALIGN = 8 +) + // FUSEInitIn is the request sent by the kernel to the daemon, // to negotiate the version and flags.
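For illustration, a sketch (not from this diff) of how the FUSE_INIT flags above are typically consumed: the effective feature set of a connection is the intersection of the flags the kernel offers in FUSEInitIn and the flags the daemon echoes back in FUSEInitOut, so a feature test is a bitwise AND. The function and parameter names here are hypothetical:

// negotiated reports whether a FUSE_INIT feature flag is active on the
// connection. kernelFlags and daemonFlags would come from the Flags
// fields of FUSEInitIn and FUSEInitOut respectively.
func negotiated(kernelFlags, daemonFlags, flag uint32) bool {
	return kernelFlags&daemonFlags&flag != 0
}

// Example: negotiated(in.Flags, out.Flags, linux.FUSE_ASYNC_READ)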
// @@ -199,7 +181,7 @@ type FUSEInitIn struct { } // FUSEInitOut is the reply sent by the daemon to the kernel -// for FUSEInitIn. +// for FUSEInitIn. We target FUSE 7.23; this struct supports 7.28. // // +marshal type FUSEInitOut struct { @@ -240,13 +222,16 @@ type FUSEInitOut struct { // if the value from daemon is too large. MaxPages uint16 - // MapAlignment is an unknown field and not used by this package at this moment. - // Use as a placeholder to be consistent with the FUSE protocol. - MapAlignment uint16 + _ uint16 _ [8]uint32 } +// FUSE_GETATTR_FH is currently the only flag of FUSEGetAttrIn.GetAttrFlags. +// If it is set, the file handle (FUSEGetAttrIn.Fh) is used to indicate the +// object instead of the node id attribute in the request header. +const FUSE_GETATTR_FH = (1 << 0) + // FUSEGetAttrIn is the request sent by the kernel to the daemon, // to get the attribute of a inode. // @@ -267,22 +252,52 @@ type FUSEGetAttrIn struct { // // +marshal type FUSEAttr struct { - Ino uint64 - Size uint64 - Blocks uint64 - Atime uint64 - Mtime uint64 - Ctime uint64 + // Ino is the inode number of this file. + Ino uint64 + + // Size is the size of this file. + Size uint64 + + // Blocks is the number of 512B blocks allocated by this file. + Blocks uint64 + + // Atime is the time of last access. + Atime uint64 + + // Mtime is the time of last modification. + Mtime uint64 + + // Ctime is the time of last status change. + Ctime uint64 + + // AtimeNsec is the nanosecond part of Atime. AtimeNsec uint32 + + // MtimeNsec is the nanosecond part of Mtime. MtimeNsec uint32 + + // CtimeNsec is the nanosecond part of Ctime. CtimeNsec uint32 - Mode uint32 - Nlink uint32 - UID uint32 - GID uint32 - Rdev uint32 - BlkSize uint32 - _ uint32 + + // Mode contains the file type and mode. + Mode uint32 + + // Nlink is the number of hard links. + Nlink uint32 + + // UID is the user ID of the owner. + UID uint32 + + // GID is the group ID of the owner. + GID uint32 + + // Rdev is the device ID if this is a special file. + Rdev uint32 + + // BlkSize is the block size for filesystem I/O. + BlkSize uint32 + + _ uint32 } // FUSEGetAttrOut is the reply sent by the daemon to the kernel @@ -301,3 +316,558 @@ type FUSEGetAttrOut struct { // Attr contains the metadata returned from the FUSE server Attr FUSEAttr } + +// FUSEEntryOut is the reply sent by the daemon to the kernel +// for FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and +// FUSE_LOOKUP. +// +// +marshal +type FUSEEntryOut struct { + // NodeID is the ID of the current inode. + NodeID uint64 + + // Generation is the generation number of the inode. + // Used to identify an inode that may have different IDs at different times. + Generation uint64 + + // EntryValid indicates the timeout for an entry. + EntryValid uint64 + + // AttrValid indicates the timeout for an entry's attributes. + AttrValid uint64 + + // EntryValidNSec indicates the timeout for an entry, in nanoseconds. + EntryValidNSec uint32 + + // AttrValidNSec indicates the timeout for an entry's attributes, in nanoseconds. + AttrValidNSec uint32 + + // Attr contains the attributes of an entry. + Attr FUSEAttr +} + +// FUSELookupIn is the request sent by the kernel to the daemon +// to look up a file name. +// +// Dynamically-sized objects cannot be marshalled. +type FUSELookupIn struct { + marshal.StubMarshallable + + // Name is a file name to be looked up. + Name string +} + +// MarshalBytes serializes r.Name to the dst buffer.
+func (r *FUSELookupIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) +} + +// SizeBytes is the size of the memory representation of FUSELookupIn. +// 1 extra byte for null-terminated string. +func (r *FUSELookupIn) SizeBytes() int { + return len(r.Name) + 1 +} + +// MAX_NON_LFS indicates the maximum offset without large file support. +const MAX_NON_LFS = ((1 << 31) - 1) + +// Flags returned by the OPEN request. +const ( + // FOPEN_DIRECT_IO indicates bypassing the page cache for this opened file. + FOPEN_DIRECT_IO = 1 << 0 + // FOPEN_KEEP_CACHE avoids invalidating the data cache on open. + FOPEN_KEEP_CACHE = 1 << 1 + // FOPEN_NONSEEKABLE indicates the file is not seekable. + FOPEN_NONSEEKABLE = 1 << 2 +) + +// FUSEOpenIn is the request sent by the kernel to the daemon, +// to negotiate flags and get a file handle. +// +// +marshal +type FUSEOpenIn struct { + // Flags of this open request. + Flags uint32 + + _ uint32 +} + +// FUSEOpenOut is the reply sent by the daemon to the kernel +// for FUSEOpenIn. +// +// +marshal +type FUSEOpenOut struct { + // Fh is the file handle for the opened file. + Fh uint64 + + // OpenFlag contains the flags for the opened file. + OpenFlag uint32 + + _ uint32 +} + +// FUSE_READ flags, consistent with the ones in include/uapi/linux/fuse.h. +const ( + FUSE_READ_LOCKOWNER = 1 << 1 +) + +// FUSEReadIn is the request sent by the kernel to the daemon +// for FUSE_READ. +// +// +marshal +type FUSEReadIn struct { + // Fh is the file handle in userspace. + Fh uint64 + + // Offset is the read offset. + Offset uint64 + + // Size is the number of bytes to read. + Size uint32 + + // ReadFlags for this FUSE_READ request. + // Currently only contains FUSE_READ_LOCKOWNER. + ReadFlags uint32 + + // LockOwner is the ID of the lock owner if there is one. + LockOwner uint64 + + // Flags for the underlying file. + Flags uint32 + + _ uint32 +} + +// FUSEWriteIn is the first part of the payload of the +// request sent by the kernel to the daemon +// for FUSE_WRITE (struct for FUSE version >= 7.9). +// +// The second part of the payload is the +// binary bytes of the data to be written. +// +// +marshal +type FUSEWriteIn struct { + // Fh is the file handle in userspace. + Fh uint64 + + // Offset is the write offset. + Offset uint64 + + // Size is the number of bytes to write. + Size uint32 + + // WriteFlags for this FUSE_WRITE request. + WriteFlags uint32 + + // LockOwner is the ID of the lock owner if there is one. + LockOwner uint64 + + // Flags for the underlying file. + Flags uint32 + + _ uint32 +} + +// FUSEWriteOut is the payload of the reply sent by the daemon to the kernel +// for a FUSE_WRITE request. +// +// +marshal +type FUSEWriteOut struct { + // Size is the number of bytes written. + Size uint32 + + _ uint32 +} + +// FUSEReleaseIn is the request sent by the kernel to the daemon +// when there is no more reference to a file. +// +// +marshal +type FUSEReleaseIn struct { + // Fh is the file handle for the file to be released. + Fh uint64 + + // Flags of the file. + Flags uint32 + + // ReleaseFlags of this release request. + ReleaseFlags uint32 + + // LockOwner is the ID of the lock owner if there is one. + LockOwner uint64 +} + +// FUSECreateMeta contains all the static fields of FUSECreateIn, +// which is used for FUSE_CREATE. +// +// +marshal +type FUSECreateMeta struct { + // Flags of the creating file. + Flags uint32 + + // Mode is the mode of the creating file. + Mode uint32 + + // Umask is the current file mode creation mask.
+ Umask uint32 + _ uint32 +} + +// FUSECreateIn contains all the arguments sent by the kernel to the daemon, to +// atomically create and open a new regular file. +// +// Dynamically-sized objects cannot be marshalled. +type FUSECreateIn struct { + marshal.StubMarshallable + + // CreateMeta contains the flags, mode and umask fields for FUSE_CREATE. + CreateMeta FUSECreateMeta + + // Name is the name of the node to create. + Name string +} + +// MarshalBytes serializes r.CreateMeta and r.Name to the dst buffer. +func (r *FUSECreateIn) MarshalBytes(buf []byte) { + r.CreateMeta.MarshalBytes(buf[:r.CreateMeta.SizeBytes()]) + copy(buf[r.CreateMeta.SizeBytes():], r.Name) +} + +// SizeBytes is the size of the memory representation of FUSECreateIn. +// 1 extra byte for null-terminated string. +func (r *FUSECreateIn) SizeBytes() int { + return r.CreateMeta.SizeBytes() + len(r.Name) + 1 +} + +// FUSEMknodMeta contains all the static fields of FUSEMknodIn, +// which is used for FUSE_MKNOD. +// +// +marshal +type FUSEMknodMeta struct { + // Mode of the inode to create. + Mode uint32 + + // Rdev encodes device major and minor information. + Rdev uint32 + + // Umask is the current file mode creation mask. + Umask uint32 + + _ uint32 +} + +// FUSEMknodIn contains all the arguments sent by the kernel +// to the daemon, to create a new file node. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEMknodIn struct { + marshal.StubMarshallable + + // MknodMeta contains the mode, rdev and umask fields for FUSE_MKNOD. + MknodMeta FUSEMknodMeta + + // Name is the name of the node to create. + Name string +} + +// MarshalBytes serializes r.MknodMeta and r.Name to the dst buffer. +func (r *FUSEMknodIn) MarshalBytes(buf []byte) { + r.MknodMeta.MarshalBytes(buf[:r.MknodMeta.SizeBytes()]) + copy(buf[r.MknodMeta.SizeBytes():], r.Name) +} + +// SizeBytes is the size of the memory representation of FUSEMknodIn. +// 1 extra byte for null-terminated string. +func (r *FUSEMknodIn) SizeBytes() int { + return r.MknodMeta.SizeBytes() + len(r.Name) + 1 +} + +// FUSESymLinkIn is the request sent by the kernel to the daemon, +// to create a symbolic link. +// +// Dynamically-sized objects cannot be marshalled. +type FUSESymLinkIn struct { + marshal.StubMarshallable + + // Name of symlink to create. + Name string + + // Target of the symlink. + Target string +} + +// MarshalBytes serializes r.Name and r.Target to the dst buffer. +// Null terminators are left at the end of r.Name and r.Target. +func (r *FUSESymLinkIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) + copy(buf[len(r.Name)+1:], r.Target) +} + +// SizeBytes is the size of the memory representation of FUSESymLinkIn. +// 2 extra bytes for the null-terminated strings. +func (r *FUSESymLinkIn) SizeBytes() int { + return len(r.Name) + len(r.Target) + 2 +} + +// FUSEEmptyIn is used by operations without a request body. +type FUSEEmptyIn struct{ marshal.StubMarshallable } + +// MarshalBytes does nothing. +func (r *FUSEEmptyIn) MarshalBytes(buf []byte) {} + +// SizeBytes is 0 for an empty request. +func (r *FUSEEmptyIn) SizeBytes() int { + return 0 +} + +// FUSEMkdirMeta contains all the static fields of FUSEMkdirIn, +// which is used for FUSE_MKDIR. +// +// +marshal +type FUSEMkdirMeta struct { + // Mode of the directory to create. + Mode uint32 + + // Umask is the user file creation mask. + Umask uint32 +} + +// FUSEMkdirIn contains all the arguments sent by the kernel +// to the daemon, to create a new directory. +// +// Dynamically-sized objects cannot be marshalled.
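A usage sketch for the StubMarshallable pattern above (hypothetical caller code, not part of this diff): the fixed-size metadata is marshalled first and the name follows, with SizeBytes reserving one byte for the NUL terminator:

// Build the wire representation of a hypothetical FUSE_MKNOD request.
req := linux.FUSEMknodIn{
	MknodMeta: linux.FUSEMknodMeta{Mode: 0644},
	Name:      "node",
}
buf := make([]byte, req.SizeBytes()) // meta + len(Name) + 1 for the NUL byte
req.MarshalBytes(buf)                // buf now holds meta followed by "node\x00"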
+type FUSEMkdirIn struct { + marshal.StubMarshallable + + // MkdirMeta contains Mode and Umask of the directory to create. + MkdirMeta FUSEMkdirMeta + + // Name of the directory to create. + Name string +} + +// MarshalBytes serializes r.MkdirMeta and r.Name to the dst buffer. +func (r *FUSEMkdirIn) MarshalBytes(buf []byte) { + r.MkdirMeta.MarshalBytes(buf[:r.MkdirMeta.SizeBytes()]) + copy(buf[r.MkdirMeta.SizeBytes():], r.Name) +} + +// SizeBytes is the size of the memory representation of FUSEMkdirIn. +// 1 extra byte for null-terminated Name string. +func (r *FUSEMkdirIn) SizeBytes() int { + return r.MkdirMeta.SizeBytes() + len(r.Name) + 1 +} + +// FUSERmDirIn is the request sent by the kernel to the daemon +// when trying to remove a directory. +// +// Dynamically-sized objects cannot be marshalled. +type FUSERmDirIn struct { + marshal.StubMarshallable + + // Name is a directory name to be removed. + Name string +} + +// MarshalBytes serializes r.Name to the dst buffer. +func (r *FUSERmDirIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) +} + +// SizeBytes is the size of the memory representation of FUSERmDirIn. +func (r *FUSERmDirIn) SizeBytes() int { + return len(r.Name) + 1 +} + +// FUSEDirents is a list of Dirents received from the FUSE daemon server. +// It is used for FUSE_READDIR. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEDirents struct { + marshal.StubMarshallable + + Dirents []*FUSEDirent +} + +// FUSEDirent is a Dirent received from the FUSE daemon server. +// It is used for FUSE_READDIR. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEDirent struct { + marshal.StubMarshallable + + // Meta contains all the static fields of FUSEDirent. + Meta FUSEDirentMeta + + // Name is the filename of the dirent. + Name string +} + +// FUSEDirentMeta contains all the static fields of FUSEDirent. +// It is used for FUSE_READDIR. +// +// +marshal +type FUSEDirentMeta struct { + // Inode of the dirent. + Ino uint64 + + // Offset of the dirent. + Off uint64 + + // NameLen is the length of the dirent name. + NameLen uint32 + + // Type of the dirent. + Type uint32 +} + +// SizeBytes is the size of the memory representation of FUSEDirents. +func (r *FUSEDirents) SizeBytes() int { + var sizeBytes int + for _, dirent := range r.Dirents { + sizeBytes += dirent.SizeBytes() + } + + return sizeBytes +} + +// UnmarshalBytes deserializes FUSEDirents from the src buffer. +func (r *FUSEDirents) UnmarshalBytes(src []byte) { + for { + if len(src) <= (*FUSEDirentMeta)(nil).SizeBytes() { + break + } + + // It's unclear how many dirents there are in src. Each dirent is dynamically + // sized and so we can't make assumptions about how many dirents we can allocate. + if r.Dirents == nil { + r.Dirents = make([]*FUSEDirent, 0) + } + + // We have to allocate a struct for each dirent - there must be a better way + // to do this. Linux allocates 1 page to store all the dirents and then + // simply reads them from the page. + var dirent FUSEDirent + dirent.UnmarshalBytes(src) + r.Dirents = append(r.Dirents, &dirent) + + src = src[dirent.SizeBytes():] + } +} + +// SizeBytes is the size of the memory representation of FUSEDirent. +func (r *FUSEDirent) SizeBytes() int { + dataSize := r.Meta.SizeBytes() + len(r.Name) + + // Each Dirent must be padded such that its size is a multiple + // of FUSE_DIRENT_ALIGN. Similar to the fuse dirent alignment + // in linux/fuse.h.
+ return (dataSize + (FUSE_DIRENT_ALIGN - 1)) & ^(FUSE_DIRENT_ALIGN - 1) +} + +// UnmarshalBytes deserializes FUSEDirent from the src buffer. +func (r *FUSEDirent) UnmarshalBytes(src []byte) { + r.Meta.UnmarshalBytes(src) + src = src[r.Meta.SizeBytes():] + + if r.Meta.NameLen > FUSE_NAME_MAX { + // The name is too long and therefore invalid. We don't + // need to unmarshal the name since it'll be thrown away. + return + } + + buf := make([]byte, r.Meta.NameLen) + name := primitive.ByteSlice(buf) + name.UnmarshalBytes(src[:r.Meta.NameLen]) + r.Name = string(name) +} + +// FATTR_* consts are the attribute flags defined in include/uapi/linux/fuse.h. +// These should be or-ed together for setattr to know what has been changed. +const ( + FATTR_MODE = (1 << 0) + FATTR_UID = (1 << 1) + FATTR_GID = (1 << 2) + FATTR_SIZE = (1 << 3) + FATTR_ATIME = (1 << 4) + FATTR_MTIME = (1 << 5) + FATTR_FH = (1 << 6) + FATTR_ATIME_NOW = (1 << 7) + FATTR_MTIME_NOW = (1 << 8) + FATTR_LOCKOWNER = (1 << 9) + FATTR_CTIME = (1 << 10) +) + +// FUSESetAttrIn is the request sent by the kernel to the daemon, +// to set the attribute(s) of a file. +// +// +marshal +type FUSESetAttrIn struct { + // Valid indicates which attributes are modified by this request. + Valid uint32 + + _ uint32 + + // Fh is used to identify the file if FATTR_FH is set in Valid. + Fh uint64 + + // Size is the size that the request wants to change to. + Size uint64 + + // LockOwner is the owner of the lock that the request wants to change to. + LockOwner uint64 + + // Atime is the access time that the request wants to change to. + Atime uint64 + + // Mtime is the modification time that the request wants to change to. + Mtime uint64 + + // Ctime is the status change time that the request wants to change to. + Ctime uint64 + + // AtimeNsec is the nanosecond part of Atime. + AtimeNsec uint32 + + // MtimeNsec is the nanosecond part of Mtime. + MtimeNsec uint32 + + // CtimeNsec is the nanosecond part of Ctime. + CtimeNsec uint32 + + // Mode is the file mode that the request wants to change to. + Mode uint32 + + _ uint32 + + // UID is the user ID of the owner that the request wants to change to. + UID uint32 + + // GID is the group ID of the owner that the request wants to change to. + GID uint32 + + _ uint32 +} + +// FUSEUnlinkIn is the request sent by the kernel to the daemon +// when trying to unlink a node. +// +// Dynamically-sized objects cannot be marshalled. +type FUSEUnlinkIn struct { + marshal.StubMarshallable + + // Name of the node to unlink. + Name string +} + +// MarshalBytes serializes r.Name to the dst buffer, which should +// have size len(r.Name) + 1 and the last byte set to 0. +func (r *FUSEUnlinkIn) MarshalBytes(buf []byte) { + copy(buf, r.Name) +} + +// SizeBytes is the size of the memory representation of FUSEUnlinkIn. +// 1 extra byte for null-terminated Name string. +func (r *FUSEUnlinkIn) SizeBytes() int { + return len(r.Name) + 1 +} diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 2c5e56ae5..7df02dd6d 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -113,7 +113,57 @@ const ( _IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS ) +// Constants from uapi/linux/fs.h. +const ( + FS_IOC_GETFLAGS = 2148034049 + FS_VERITY_FL = 1048576 +) + +// Constants from uapi/linux/fsverity.h. +const ( + FS_IOC_ENABLE_VERITY = 1082156677 + FS_IOC_MEASURE_VERITY = 3221513862 +) + +// DigestMetadata is a helper struct for VerityDigest.
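To make the FUSE_DIRENT_ALIGN rounding in FUSEDirent.SizeBytes concrete (a worked example with illustrative numbers, not part of the diff): with FUSE_DIRENT_ALIGN = 8, the expression rounds a dirent's size up to the next multiple of 8, so a 29-byte dirent occupies 32 bytes and a 32-byte dirent stays at 32:

// alignUp mirrors the expression used in FUSEDirent.SizeBytes above.
func alignUp(dataSize int) int {
	return (dataSize + (FUSE_DIRENT_ALIGN - 1)) & ^(FUSE_DIRENT_ALIGN - 1)
}

// alignUp(29) == 32, alignUp(32) == 32, alignUp(33) == 40.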
+// +// +marshal +type DigestMetadata struct { + DigestAlgorithm uint16 + DigestSize uint16 +} + +// SizeOfDigestMetadata is the size of struct DigestMetadata. +const SizeOfDigestMetadata = 4 + +// VerityDigest is the struct from uapi/linux/fsverity.h. +type VerityDigest struct { + Metadata DigestMetadata + Digest []byte +} + // IOC outputs the result of _IOC macro in asm-generic/ioctl.h. func IOC(dir, typ, nr, size uint32) uint32 { return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT } + +// Kcov ioctls from kernel/kcov.h. +var ( + KCOV_INIT_TRACE = IOC(_IOC_READ, 'c', 1, 8) + KCOV_ENABLE = IOC(_IOC_NONE, 'c', 100, 0) + KCOV_DISABLE = IOC(_IOC_NONE, 'c', 101, 0) +) + +// Kcov trace types from kernel/kcov.h. +const ( + KCOV_TRACE_PC = 0 + KCOV_TRACE_CMP = 1 +) + +// Kcov state constants from kernel/kcov.h. +const ( + KCOV_MODE_DISABLED = 0 + KCOV_MODE_INIT = 1 + KCOV_MODE_TRACE_PC = 2 + KCOV_MODE_TRACE_CMP = 3 +) diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 22acd2d43..c6e65df62 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -37,6 +37,8 @@ const IPC_PRIVATE = 0 // features like 32-bit UIDs. // IPCPerm is equivalent to struct ipc64_perm. +// +// +marshal type IPCPerm struct { Key uint32 UID uint32 diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index 281acdbde..3b4abece1 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package linux contains the constants and types needed to interface with a Linux kernel. +// Package linux contains the constants and types needed to interface with a +// Linux kernel. package linux // NumSoftIRQ is the number of software IRQs, exposed via /proc/stat. @@ -21,6 +22,8 @@ package linux const NumSoftIRQ = 10 // Sysinfo is the structure provided by sysinfo on linux versions > 2.3.48. +// +// +marshal type Sysinfo struct { Uptime int64 Loads [3]uint64 @@ -34,6 +37,6 @@ type Sysinfo struct { _ [6]byte // Pad Procs to 64bits. TotalHigh uint64 FreeHigh uint64 - Unit uint32 - /* The _f field in the glibc version of Sysinfo has size 0 on AMD64 */ + Unit uint32 `marshal:"unaligned"` // Struct ends mid-64-bit-word. + // The _f field in the glibc version of Sysinfo has size 0 on AMD64. } diff --git a/pkg/abi/linux/membarrier.go b/pkg/abi/linux/membarrier.go new file mode 100644 index 000000000..4f6021a1d --- /dev/null +++ b/pkg/abi/linux/membarrier.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// membarrier(2) commands, from include/uapi/linux/membarrier.h.
+const ( + MEMBARRIER_CMD_QUERY = 0 + MEMBARRIER_CMD_GLOBAL = (1 << 0) + MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1) + MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2) + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3) + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4) + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5) + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6) + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7) + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8) +) + +// membarrier(2) flags, from include/uapi/linux/membarrier.h. +const ( + MEMBARRIER_CMD_FLAG_CPU = (1 << 0) +) diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go index 91e35366f..b521144d9 100644 --- a/pkg/abi/linux/netfilter.go +++ b/pkg/abi/linux/netfilter.go @@ -17,9 +17,9 @@ package linux import ( "io" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // This file contains structures required to support netfilter, specifically @@ -265,6 +265,18 @@ type KernelXTEntryMatch struct { Data []byte } +// XTGetRevision corresponds to xt_get_revision in +// include/uapi/linux/netfilter/x_tables.h +// +// +marshal +type XTGetRevision struct { + Name ExtensionName + Revision uint8 +} + +// SizeOfXTGetRevision is the size of an XTGetRevision. +const SizeOfXTGetRevision = 30 + // XTEntryTarget holds a target for a rule. For example, it can specify that // packets matching the rule should DROP, ACCEPT, or use an extension target. // iptables-extension(8) has a list of possible targets. @@ -285,6 +297,13 @@ type XTEntryTarget struct { // SizeOfXTEntryTarget is the size of an XTEntryTarget. const SizeOfXTEntryTarget = 32 +// KernelXTEntryTarget is identical to XTEntryTarget, but contains a +// variable-length Data field. +type KernelXTEntryTarget struct { + XTEntryTarget + Data []byte +} + // XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE, // RETURN, or jump. It corresponds to struct xt_standard_target in // include/uapi/linux/netfilter/x_tables.h. @@ -450,9 +469,9 @@ func (ke *KernelIPTGetEntries) UnmarshalUnsafe(src []byte) { } // CopyIn implements marshal.Marshallable.CopyIn. -func (ke *KernelIPTGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. - length, err := task.CopyInBytes(addr, buf) // escapes: okay. +func (ke *KernelIPTGetEntries) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) { + buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. + length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. ke.UnmarshalBytes(buf) // escapes: fallback. @@ -460,21 +479,21 @@ func (ke *KernelIPTGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int } // CopyOut implements marshal.Marshallable.CopyOut. -func (ke *KernelIPTGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { +func (ke *KernelIPTGetEntries) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall // back to MarshalBytes. - return task.CopyOutBytes(addr, ke.marshalAll(task)) + return cc.CopyOutBytes(addr, ke.marshalAll(cc)) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
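As a hedged sketch of how the membarrier(2) constants above are used from Go (hypothetical caller code, not part of this diff; it assumes the raw syscall wrapper and SYS_MEMBARRIER number from golang.org/x/sys/unix):

package main

import "golang.org/x/sys/unix"

// membarrierGlobalSupported reports whether MEMBARRIER_CMD_GLOBAL is
// available. MEMBARRIER_CMD_QUERY (0) returns a bitmask of supported
// commands.
func membarrierGlobalSupported() bool {
	mask, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, 0, 0, 0)
	if errno != 0 {
		return false // membarrier(2) unavailable, e.g. pre-4.3 kernels.
	}
	return mask&(1<<0) != 0 // MEMBARRIER_CMD_GLOBAL
}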
-func (ke *KernelIPTGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { +func (ke *KernelIPTGetEntries) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall // back to MarshalBytes. - return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit]) + return cc.CopyOutBytes(addr, ke.marshalAll(cc)[:limit]) } -func (ke *KernelIPTGetEntries) marshalAll(task marshal.Task) []byte { - buf := task.CopyScratchBuffer(ke.SizeBytes()) +func (ke *KernelIPTGetEntries) marshalAll(cc marshal.CopyContext) []byte { + buf := cc.CopyScratchBuffer(ke.SizeBytes()) ke.MarshalBytes(buf) return buf } @@ -510,6 +529,8 @@ type IPTReplace struct { const SizeOfIPTReplace = 96 // ExtensionName holds the name of a netfilter extension. +// +// +marshal type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte // String implements fmt.Stringer. diff --git a/pkg/abi/linux/netfilter_ipv6.go b/pkg/abi/linux/netfilter_ipv6.go index 9bb9efb10..6d31eb5e3 100644 --- a/pkg/abi/linux/netfilter_ipv6.go +++ b/pkg/abi/linux/netfilter_ipv6.go @@ -17,9 +17,9 @@ package linux import ( "io" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // This file contains structures required to support IPv6 netfilter and @@ -128,9 +128,9 @@ func (ke *KernelIP6TGetEntries) UnmarshalUnsafe(src []byte) { } // CopyIn implements marshal.Marshallable.CopyIn. -func (ke *KernelIP6TGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. - length, err := task.CopyInBytes(addr, buf) // escapes: okay. +func (ke *KernelIP6TGetEntries) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) { + buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. + length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results // in a partially unmarshalled struct. ke.UnmarshalBytes(buf) // escapes: fallback. @@ -138,21 +138,21 @@ func (ke *KernelIP6TGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (in } // CopyOut implements marshal.Marshallable.CopyOut. -func (ke *KernelIP6TGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { +func (ke *KernelIP6TGetEntries) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, // fall back to MarshalBytes. - return task.CopyOutBytes(addr, ke.marshalAll(task)) + return cc.CopyOutBytes(addr, ke.marshalAll(cc)) } // CopyOutN implements marshal.Marshallable.CopyOutN. -func (ke *KernelIP6TGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { +func (ke *KernelIP6TGetEntries) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fall // back to MarshalBytes. 
- return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit]) + return cc.CopyOutBytes(addr, ke.marshalAll(cc)[:limit]) } -func (ke *KernelIP6TGetEntries) marshalAll(task marshal.Task) []byte { - buf := task.CopyScratchBuffer(ke.SizeBytes()) +func (ke *KernelIP6TGetEntries) marshalAll(cc marshal.CopyContext) []byte { + buf := cc.CopyScratchBuffer(ke.SizeBytes()) ke.MarshalBytes(buf) return buf } @@ -290,6 +290,19 @@ type IP6TIP struct { const SizeOfIP6TIP = 136 +// Flags in IP6TIP.Flags. Corresponding constants are in +// include/uapi/linux/netfilter_ipv6/ip6_tables.h. +const ( + // Whether to check the Protocol field. + IP6T_F_PROTO = 0x01 + // Whether to match the TOS field. + IP6T_F_TOS = 0x02 + // Indicates that the jump target is an absolute GOTO, not an offset. + IP6T_F_GOTO = 0x04 + // Enables all flags. + IP6T_F_MASK = 0x07 +) + // Flags in IP6TIP.InverseFlags. Corresponding constants are in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. const ( @@ -308,3 +321,16 @@ const ( // Enable all flags. IP6T_INV_MASK = 0x7F ) + +// NFNATRange corresponds to struct nf_nat_range in +// include/uapi/linux/netfilter/nf_nat.h. +type NFNATRange struct { + Flags uint32 + MinAddr Inet6Addr + MaxAddr Inet6Addr + MinProto uint16 // Network byte order. + MaxProto uint16 // Network byte order. +} + +// SizeOfNFNATRange is the size of NFNATRange. +const SizeOfNFNATRange = 40 diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 0ba086c76..b41f94a69 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -40,6 +40,8 @@ const ( ) // SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h. +// +// +marshal type SockAddrNetlink struct { Family uint16 _ uint16 diff --git a/pkg/abi/linux/poll.go b/pkg/abi/linux/poll.go index c04d26e4c..3443a5768 100644 --- a/pkg/abi/linux/poll.go +++ b/pkg/abi/linux/poll.go @@ -15,6 +15,8 @@ package linux // PollFD is struct pollfd, used by poll(2)/ppoll(2), from uapi/asm-generic/poll.h. +// +// +marshal slice:PollFDSlice type PollFD struct { FD int32 Events int16 diff --git a/pkg/abi/linux/rusage.go b/pkg/abi/linux/rusage.go index d8302dc85..e29d0ac7e 100644 --- a/pkg/abi/linux/rusage.go +++ b/pkg/abi/linux/rusage.go @@ -26,6 +26,8 @@ const ( ) // Rusage represents the Linux struct rusage. +// +// +marshal type Rusage struct { UTime Timeval STime Timeval diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index d0607e256..5be3f10f9 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -34,11 +34,11 @@ type BPFAction uint32 const ( SECCOMP_RET_KILL_PROCESS BPFAction = 0x80000000 - SECCOMP_RET_KILL_THREAD = 0x00000000 - SECCOMP_RET_TRAP = 0x00030000 - SECCOMP_RET_ERRNO = 0x00050000 - SECCOMP_RET_TRACE = 0x7ff00000 - SECCOMP_RET_ALLOW = 0x7fff0000 + SECCOMP_RET_KILL_THREAD BPFAction = 0x00000000 + SECCOMP_RET_TRAP BPFAction = 0x00030000 + SECCOMP_RET_ERRNO BPFAction = 0x00050000 + SECCOMP_RET_TRACE BPFAction = 0x7ff00000 + SECCOMP_RET_ALLOW BPFAction = 0x7fff0000 ) func (a BPFAction) String() string { @@ -64,9 +64,41 @@ func (a BPFAction) Data() uint16 { return uint16(a & SECCOMP_RET_DATA) } +// WithReturnCode sets the lower 16 bits of the SECCOMP_RET_ERRNO or +// SECCOMP_RET_TRACE actions to the provided return code, overwriting the +// previous return code, and returns a new BPFAction. If the action is not +// SECCOMP_RET_ERRNO or SECCOMP_RET_TRACE, this panics.
+func (a BPFAction) WithReturnCode(code uint16) BPFAction { + // mask out the previous return value + baseAction := a & SECCOMP_RET_ACTION_FULL + if baseAction == SECCOMP_RET_ERRNO || baseAction == SECCOMP_RET_TRACE { + return BPFAction(uint32(baseAction) | uint32(code)) + } + panic("WithReturnCode only valid for SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE") +} + // SockFprog is sock_fprog taken from <linux/filter.h>. type SockFprog struct { Len uint16 pad [6]byte Filter *BPFInstruction } + +// SeccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +// +// +marshal +type SeccompData struct { + // Nr is the system call number. + Nr int32 + + // Arch is an AUDIT_ARCH_* value indicating the system call convention. + Arch uint32 + + // InstructionPointer is the value of the instruction pointer at the time + // of the system call. + InstructionPointer uint64 + + // Args contains the first 6 system call arguments. + Args [6]uint64 +} diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index de422c519..487a626cc 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -35,6 +35,8 @@ const ( const SEM_UNDO = 0x1000 // SemidDS is equivalent to struct semid64_ds. +// +// +marshal type SemidDS struct { SemPerm IPCPerm SemOTime TimeT @@ -45,6 +47,8 @@ type SemidDS struct { } // Sembuf is equivalent to struct sembuf. +// +// +marshal slice:SembufSlice type Sembuf struct { SemNum uint16 SemOp int16 diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go index e45aadb10..274b1e847 100644 --- a/pkg/abi/linux/shm.go +++ b/pkg/abi/linux/shm.go @@ -51,6 +51,8 @@ const ( // ShmidDS is equivalent to struct shmid64_ds. Source: // include/uapi/asm-generic/shmbuf.h +// +// +marshal type ShmidDS struct { ShmPerm IPCPerm ShmSegsz uint64 @@ -66,6 +68,8 @@ type ShmidDS struct { } // ShmParams is equivalent to struct shminfo. Source: include/uapi/linux/shm.h +// +// +marshal type ShmParams struct { ShmMax uint64 ShmMin uint64 @@ -75,6 +79,8 @@ type ShmParams struct { } // ShmInfo is equivalent to struct shm_info. Source: include/uapi/linux/shm.h +// +// +marshal type ShmInfo struct { UsedIDs int32 // Number of currently existing segments. _ [4]byte diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index 1c330e763..6ca57ffbb 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -214,6 +214,8 @@ const ( ) // Sigevent represents struct sigevent. +// +// +marshal type Sigevent struct { Value uint64 // union sigval {int, void*} Signo int32 diff --git a/pkg/abi/linux/signalfd.go b/pkg/abi/linux/signalfd.go index 85fad9956..468c6a387 100644 --- a/pkg/abi/linux/signalfd.go +++ b/pkg/abi/linux/signalfd.go @@ -23,6 +23,8 @@ const ( ) // SignalfdSiginfo is the siginfo encoding for signalfds. +// +// +marshal type SignalfdSiginfo struct { Signo uint32 Errno int32 @@ -41,5 +43,5 @@ type SignalfdSiginfo struct { STime uint64 Addr uint64 AddrLSB uint16 - _ [48]uint8 + _ [48]uint8 `marshal:"unaligned"` } diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index e37c8727d..d156d41e4 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -14,7 +14,10 @@ package linux -import "gvisor.dev/gvisor/pkg/binary" +import ( + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" +) // Address families, from linux/socket.h. const ( @@ -265,6 +268,8 @@ type InetMulticastRequestWithNIC struct { type Inet6Addr [16]byte // SockAddrInet6 is struct sockaddr_in6, from uapi/linux/in6.h. 
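An illustrative use of the typed seccomp actions and WithReturnCode added above (hypothetical filter-construction code, not from this diff):

// Fail the syscall with errno 1 (EPERM) instead of killing the task.
action := linux.SECCOMP_RET_ERRNO.WithReturnCode(1)

// The low 16 bits now carry the errno value; Data(), defined earlier in
// this file, extracts them, so action.Data() == 1 here.
_ = action.Data()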
+// +// +marshal type SockAddrInet6 struct { Family uint16 Port uint16 @@ -274,6 +279,8 @@ type SockAddrInet6 struct { } // SockAddrLink is a struct sockaddr_ll, from uapi/linux/if_packet.h. +// +// +marshal type SockAddrLink struct { Family uint16 Protocol uint16 @@ -290,6 +297,8 @@ type SockAddrLink struct { const UnixPathMax = 108 // SockAddrUnix is struct sockaddr_un, from uapi/linux/un.h. +// +// +marshal type SockAddrUnix struct { Family uint16 Path [UnixPathMax]int8 @@ -299,6 +308,8 @@ type SockAddrUnix struct { // equivalent to struct sockaddr. SockAddr ensures that a well-defined set of // types can be used as socket addresses. type SockAddr interface { + marshal.Marshallable + // implementsSockAddr exists purely to allow a type to indicate that they // implement this interface. This method is a no-op and shouldn't be called. implementsSockAddr() diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index e6860ed49..206f5af7e 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -93,6 +93,8 @@ const ( const maxSecInDuration = math.MaxInt64 / int64(time.Second) // TimeT represents time_t in <time.h>. It represents time in seconds. +// +// +marshal type TimeT int64 // NsecToTimeT translates nanoseconds to TimeT (seconds). @@ -102,7 +104,7 @@ func NsecToTimeT(nsec int64) TimeT { // Timespec represents struct timespec in <time.h>. // -// +marshal +// +marshal slice:TimespecSlice type Timespec struct { Sec int64 Nsec int64 @@ -158,7 +160,7 @@ const SizeOfTimeval = 16 // Timeval represents struct timeval in <time.h>. // -// +marshal +// +marshal slice:TimevalSlice type Timeval struct { Sec int64 Usec int64 @@ -196,6 +198,8 @@ func DurationToTimeval(dur time.Duration) Timeval { } // Itimerspec represents struct itimerspec in <time.h>. +// +// +marshal type Itimerspec struct { Interval Timespec Value Timespec @@ -206,12 +210,16 @@ type Itimerspec struct { // struct timeval it_interval; /* next value */ // struct timeval it_value; /* current value */ // }; +// +// +marshal type ItimerVal struct { Interval Timeval Value Timeval } // ClockT represents type clock_t. +// +// +marshal type ClockT int64 // ClockTFromDuration converts time.Duration to clock_t. @@ -220,6 +228,8 @@ func ClockTFromDuration(d time.Duration) ClockT { } // Tms represents struct tms, used by times(2). +// +// +marshal type Tms struct { UTime ClockT STime ClockT @@ -229,6 +239,8 @@ type Tms struct { // TimerID represents type timer_t, which identifies a POSIX per-process // interval timer. +// +// +marshal type TimerID int32 // StatxTimestamp represents struct statx_timestamp. diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index 8ac02aee8..47e65d9fb 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -23,6 +23,8 @@ const ( ) // Winsize is struct winsize, defined in uapi/asm-generic/termios.h. +// +// +marshal type Winsize struct { Row uint16 Col uint16 @@ -31,6 +33,8 @@ type Winsize struct { } // Termios is struct termios, defined in uapi/asm-generic/termbits.h. +// +// +marshal type Termios struct { InputFlags uint32 OutputFlags uint32 @@ -321,9 +325,9 @@ var MasterTermios = KernelTermios{ OutputSpeed: 38400, } -// DefaultSlaveTermios is the default terminal configuration of the slave end -// of a Unix98 pseudoterminal. -var DefaultSlaveTermios = KernelTermios{ +// DefaultReplicaTermios is the default terminal configuration of the replica +// end of a Unix98 pseudoterminal. 
+var DefaultReplicaTermios = KernelTermios{ InputFlags: ICRNL | IXON, OutputFlags: OPOST | ONLCR, ControlFlags: B38400 | CS8 | CREAD, @@ -337,6 +341,7 @@ var DefaultSlaveTermios = KernelTermios{ // include/uapi/asm-generic/termios.h. // // +stateify savable +// +marshal type WindowSize struct { Rows uint16 Cols uint16 diff --git a/pkg/abi/linux/utsname.go b/pkg/abi/linux/utsname.go index 60f220a67..cb7c95437 100644 --- a/pkg/abi/linux/utsname.go +++ b/pkg/abi/linux/utsname.go @@ -26,6 +26,8 @@ const ( ) // UtsName represents struct utsname, the struct returned by uname(2). +// +// +marshal type UtsName struct { Sysname [UTSLen + 1]byte Nodename [UTSLen + 1]byte diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go index 99180b208..8ef837f27 100644 --- a/pkg/abi/linux/xattr.go +++ b/pkg/abi/linux/xattr.go @@ -23,6 +23,9 @@ const ( XATTR_CREATE = 1 XATTR_REPLACE = 2 + XATTR_TRUSTED_PREFIX = "trusted." + XATTR_TRUSTED_PREFIX_LEN = len(XATTR_TRUSTED_PREFIX) + XATTR_USER_PREFIX = "user." XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX) ) diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index ffc918846..bd3a5cce9 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -6,7 +6,10 @@ go_library( name = "amutex", srcs = ["amutex.go"], visibility = ["//:sandbox"], - deps = ["//pkg/syserror"], + deps = [ + "//pkg/context", + "//pkg/syserror", + ], ) go_test( diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index a078a31db..d7acc1d9f 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -19,41 +19,17 @@ package amutex import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) // Sleeper must be implemented by users of the abortable mutex to allow for // cancellation of waits. -type Sleeper interface { - // SleepStart is called by the AbortableMutex.Lock() function when the - // mutex is contended and the goroutine is about to sleep. - // - // A channel can be returned that causes the sleep to be canceled if - // it's readable. If no cancellation is desired, nil can be returned. - SleepStart() <-chan struct{} - - // SleepFinish is called by AbortableMutex.Lock() once a contended mutex - // is acquired or the wait is aborted. - SleepFinish(success bool) - - // Interrupted returns true if the wait is aborted. - Interrupted() bool -} +type Sleeper = context.ChannelSleeper // NoopSleeper is a stateless no-op implementation of Sleeper for anonymous // embedding in other types that do not support cancelation. -type NoopSleeper struct{} - -// SleepStart implements Sleeper.SleepStart. -func (NoopSleeper) SleepStart() <-chan struct{} { - return nil -} - -// SleepFinish implements Sleeper.SleepFinish. -func (NoopSleeper) SleepFinish(success bool) {} - -// Interrupted implements Sleeper.Interrupted. -func (NoopSleeper) Interrupted() bool { return false } +type NoopSleeper = context.Context // Block blocks until either receiving from ch succeeds (in which case it // returns nil) or sleeper is interrupted (in which case it returns diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go index c8ee0c3b1..069d0395d 100644 --- a/pkg/bpf/decoder.go +++ b/pkg/bpf/decoder.go @@ -21,10 +21,15 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) -// DecodeProgram translates an array of BPF instructions into text format. -func DecodeProgram(program []linux.BPFInstruction) (string, error) { +// DecodeProgram translates a compiled BPF program into text format. 
+func DecodeProgram(p Program) (string, error) { + return DecodeInstructions(p.instructions) +} + +// DecodeInstructions translates an array of BPF instructions into text format. +func DecodeInstructions(instns []linux.BPFInstruction) (string, error) { var ret bytes.Buffer - for line, s := range program { + for line, s := range instns { ret.WriteString(fmt.Sprintf("%v: ", line)) if err := decode(s, line, &ret); err != nil { return "", err @@ -34,7 +39,7 @@ func DecodeProgram(program []linux.BPFInstruction) (string, error) { return ret.String(), nil } -// Decode translates BPF instruction into text format. +// Decode translates a single BPF instruction into text format. func Decode(inst linux.BPFInstruction) (string, error) { var ret bytes.Buffer err := decode(inst, -1, &ret) diff --git a/pkg/bpf/decoder_test.go b/pkg/bpf/decoder_test.go index 6a023f0c0..bb971ce21 100644 --- a/pkg/bpf/decoder_test.go +++ b/pkg/bpf/decoder_test.go @@ -93,7 +93,7 @@ func TestDecode(t *testing.T) { } } -func TestDecodeProgram(t *testing.T) { +func TestDecodeInstructions(t *testing.T) { for _, test := range []struct { name string program []linux.BPFInstruction @@ -126,7 +126,7 @@ func TestDecodeProgram(t *testing.T) { program: []linux.BPFInstruction{Stmt(Ld+Abs+W, 10), Stmt(Ld+Len+Mem, 0)}, fail: true}, } { - got, err := DecodeProgram(test.program) + got, err := DecodeInstructions(test.program) if test.fail { if err == nil { t.Errorf("%s: Decode(...) failed, expected: 'error', got: %q", test.name, got) diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go index 7992044d0..caaf99c83 100644 --- a/pkg/bpf/program_builder.go +++ b/pkg/bpf/program_builder.go @@ -32,13 +32,21 @@ type ProgramBuilder struct { // Maps label names to label objects. labels map[string]*label + // unusableLabels are labels that are added before being referenced in a + // jump. Any labels added this way cannot be referenced later in order to + // avoid backwards references. + unusableLabels map[string]bool + // Array of BPF instructions that makes up the program. instructions []linux.BPFInstruction } // NewProgramBuilder creates a new ProgramBuilder instance. func NewProgramBuilder() *ProgramBuilder { - return &ProgramBuilder{labels: map[string]*label{}} + return &ProgramBuilder{ + labels: map[string]*label{}, + unusableLabels: map[string]bool{}, + } } // label contains information to resolve a label to an offset. @@ -108,9 +116,12 @@ func (b *ProgramBuilder) AddJumpLabels(code uint16, k uint32, jtLabel, jfLabel s func (b *ProgramBuilder) AddLabel(name string) error { l, ok := b.labels[name] if !ok { - // This is done to catch jump backwards cases, but it's not strictly wrong - // to have unused labels. - return fmt.Errorf("Adding a label that hasn't been used is not allowed: %v", name) + if _, ok = b.unusableLabels[name]; ok { + return fmt.Errorf("label %q already set", name) + } + // Mark the label as unusable. This is done to catch backwards jumps. 
+ b.unusableLabels[name] = true + return nil } if l.target != -1 { return fmt.Errorf("label %q target already set: %v", name, l.target) @@ -141,6 +152,10 @@ func (b *ProgramBuilder) addLabelSource(labelName string, t jmpType) { func (b *ProgramBuilder) resolveLabels() error { for key, v := range b.labels { + if _, ok := b.unusableLabels[key]; ok { + return fmt.Errorf("backwards reference detected for label: %q", key) + } + if v.target == -1 { return fmt.Errorf("label target not set: %v", key) } diff --git a/pkg/bpf/program_builder_test.go b/pkg/bpf/program_builder_test.go index 92ca5f4c3..37f684f25 100644 --- a/pkg/bpf/program_builder_test.go +++ b/pkg/bpf/program_builder_test.go @@ -26,16 +26,16 @@ func validate(p *ProgramBuilder, expected []linux.BPFInstruction) error { if err != nil { return fmt.Errorf("Instructions() failed: %v", err) } - got, err := DecodeProgram(instructions) + got, err := DecodeInstructions(instructions) if err != nil { - return fmt.Errorf("DecodeProgram('instructions') failed: %v", err) + return fmt.Errorf("DecodeInstructions('instructions') failed: %v", err) } - expectedDecoded, err := DecodeProgram(expected) + expectedDecoded, err := DecodeInstructions(expected) if err != nil { - return fmt.Errorf("DecodeProgram('expected') failed: %v", err) + return fmt.Errorf("DecodeInstructions('expected') failed: %v", err) } if got != expectedDecoded { - return fmt.Errorf("DecodeProgram() failed, expected: %q, got: %q", expectedDecoded, got) + return fmt.Errorf("DecodeInstructions() failed, expected: %q, got: %q", expectedDecoded, got) } return nil } @@ -124,10 +124,38 @@ func TestProgramBuilderLabelWithNoInstruction(t *testing.T) { } } +// TestProgramBuilderUnusedLabel tests that adding an unused label doesn't +// cause program generation to fail. func TestProgramBuilderUnusedLabel(t *testing.T) { p := NewProgramBuilder() - if err := p.AddLabel("unused"); err == nil { - t.Errorf("AddLabel(unused) should have failed") + p.AddStmt(Ld+Abs+W, 10) + p.AddJump(Jmp+Ja, 10, 0, 0) + + expected := []linux.BPFInstruction{ + Stmt(Ld+Abs+W, 10), + Jump(Jmp+Ja, 10, 0, 0), + } + + if err := p.AddLabel("unused"); err != nil { + t.Errorf("AddLabel(unused) should have succeeded") + } + + if err := validate(p, expected); err != nil { + t.Errorf("Validate() failed: %v", err) + } +} + +// TestProgramBuilderBackwardsReference tests that including a backwards +// reference to a label in a program causes a failure. 
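A hedged sketch of the label contract these changes enforce (hypothetical builder usage, not from this diff): a jump must reference its label before AddLabel binds the target, so all label references are forward; binding first and jumping afterwards is the backwards reference that Instructions() now rejects, as the test below demonstrates.

b := NewProgramBuilder()
b.AddStmt(Ld+Abs+W, 10)
// Reference the label first with a forward jump...
b.AddJumpTrueLabel(Jmp+Jeq+K, 10, "ok", 0)
// ...then bind it. The reverse order would trip the unusableLabels check.
if err := b.AddLabel("ok"); err != nil {
	panic(err) // only fails if "ok" was already bound
}
b.AddStmt(Ld+Abs+W, 0)
insns, err := b.Instructions() // resolves the forward reference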
+func TestProgramBuilderBackwardsReference(t *testing.T) { + p := NewProgramBuilder() + if err := p.AddLabel("bw_label"); err != nil { + t.Errorf("failed to add label") + } + p.AddStmt(Ld+Abs+W, 10) + p.AddJumpTrueLabel(Jmp+Jeq+K, 10, "bw_label", 0) + if _, err := p.Instructions(); err == nil { + t.Errorf("Instructions() should have failed") + } } diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD index dcd086298..1186f788e 100644 --- a/pkg/buffer/BUILD +++ b/pkg/buffer/BUILD @@ -20,14 +20,17 @@ go_library( srcs = [ "buffer.go", "buffer_list.go", + "pool.go", "safemem.go", "view.go", "view_unsafe.go", ], visibility = ["//visibility:public"], deps = [ + "//pkg/context", "//pkg/log", "//pkg/safemem", + "//pkg/usermem", ], ) go_test( name = "buffer_test", size = "small", srcs = [ + "pool_test.go", "safemem_test.go", "view_test.go", ], library = ":buffer", - deps = ["//pkg/safemem"], + deps = [ + "//pkg/safemem", + "//pkg/state", + ], ) diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go index c6d089fd9..311808ae9 100644 --- a/pkg/buffer/buffer.go +++ b/pkg/buffer/buffer.go @@ -14,36 +14,26 @@ // Package buffer provides the implementation of a buffer view. // -// A view is an flexible buffer, backed by a pool, supporting the safecopy -// operations natively as well as the ability to grow via either prepend or -// append, as well as shrink. +// A view is a flexible buffer, supporting the safecopy operations natively as +// well as the ability to grow via either prepend or append, as well as shrink. package buffer -import ( - "sync" -) - -const bufferSize = 8144 // See below. - // buffer encapsulates a queueable byte buffer. // -// Note that the total size is slightly less than two pages. This is done -// intentionally to ensure that the buffer object aligns with runtime -// internals. We have no hard size or alignment requirements. This two page -// size will effectively minimize internal fragmentation, but still have a -// large enough chunk to limit excessive segmentation. -// // +stateify savable type buffer struct { - data [bufferSize]byte + data []byte read int write int bufferEntry } -// reset resets internal data. -// -// This must be called before returning the buffer to the pool. +// init performs in-place initialization for the zero value. +func (b *buffer) init(size int) { + b.data = make([]byte, size) +} + +// Reset resets read and write locations, effectively emptying the buffer. func (b *buffer) Reset() { b.read = 0 b.write = 0 @@ -85,10 +75,3 @@ func (b *buffer) WriteMove(n int) { func (b *buffer) WriteSlice() []byte { return b.data[b.write:] } - -// bufferPool is a pool for buffers. -var bufferPool = sync.Pool{ - New: func() interface{} { - return new(buffer) - }, -} diff --git a/pkg/buffer/pool.go b/pkg/buffer/pool.go new file mode 100644 index 000000000..7ad6132ab --- /dev/null +++ b/pkg/buffer/pool.go @@ -0,0 +1,83 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+package buffer
+
+const (
+ // embeddedCount is the number of buffer structures embedded in the pool. It
+ // is also the number of buffers allocated at a time once the embedded
+ // storage is exhausted.
+ embeddedCount = 8
+
+ // defaultBufferSize is the default size for each underlying storage buffer.
+ //
+ // It is slightly less than two pages. This is done intentionally to ensure
+ // that the buffer object aligns with runtime internals. This two page size
+ // will effectively minimize internal fragmentation, but still have a large
+ // enough chunk to limit excessive segmentation.
+ defaultBufferSize = 8144
+)
+
+// pool allocates buffers.
+//
+// It contains embedded buffer storage for a fast path when the number of
+// buffers needed is small.
+//
+// +stateify savable
+type pool struct {
+ bufferSize int
+ avail []buffer `state:"nosave"`
+ embeddedStorage [embeddedCount]buffer `state:"wait"`
+}
+
+// get gets a new buffer from p.
+func (p *pool) get() *buffer {
+ if p.avail == nil {
+ p.avail = p.embeddedStorage[:]
+ }
+ if len(p.avail) == 0 {
+ p.avail = make([]buffer, embeddedCount)
+ }
+ if p.bufferSize <= 0 {
+ p.bufferSize = defaultBufferSize
+ }
+ buf := &p.avail[0]
+ buf.init(p.bufferSize)
+ p.avail = p.avail[1:]
+ return buf
+}
+
+// put releases buf.
+func (p *pool) put(buf *buffer) {
+ // Remove reference to the underlying storage, allowing it to be garbage
+ // collected.
+ buf.data = nil
+}
+
+// setBufferSize sets the size of the underlying storage buffers for future
+// allocations. It can be called at any time.
+func (p *pool) setBufferSize(size int) {
+ p.bufferSize = size
+}
+
+// afterLoad is invoked by stateify.
+func (p *pool) afterLoad() {
+ // S/R does not correctly save avail as a sub-slice of embeddedStorage.
+ // Restore the available portion of embeddedStorage manually, leaving avail
+ // nil if none of the embedded buffers were used.
+ for i := len(p.embeddedStorage); i > 0; i-- {
+ if p.embeddedStorage[i-1].data != nil {
+ p.avail = p.embeddedStorage[i:]
+ break
+ }
+ }
+}
diff --git a/pkg/buffer/pool_test.go b/pkg/buffer/pool_test.go
new file mode 100644
index 000000000..8584bac89
--- /dev/null
+++ b/pkg/buffer/pool_test.go
@@ -0,0 +1,51 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
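The tests that follow exercise the pool directly. As a quick orientation, here is a minimal, hypothetical sketch of the intended get/put lifecycle, assuming code inside package buffer (exampleUsage is invented for illustration; pool, get, put, and setBufferSize are defined in pool.go above):

	// exampleUsage is illustrative only and is not part of this change.
	func exampleUsage() {
		var p pool            // The zero value is ready to use.
		p.setBufferSize(4096) // Optional; get() falls back to defaultBufferSize.

		// The first embeddedCount buffer structs come from embeddedStorage;
		// the backing data slice is always allocated by init() inside get().
		buf := p.get()
		n := copy(buf.WriteSlice(), "hello")
		buf.WriteMove(n)

		// put() drops the data slice so its storage can be garbage collected.
		p.put(buf)
	}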
+ +package buffer + +import ( + "testing" +) + +func TestGetDefaultBufferSize(t *testing.T) { + var p pool + for i := 0; i < embeddedCount*2; i++ { + buf := p.get() + if got, want := len(buf.data), defaultBufferSize; got != want { + t.Errorf("#%d len(buf.data) = %d, want %d", i, got, want) + } + } +} + +func TestGetCustomBufferSize(t *testing.T) { + const size = 100 + + var p pool + p.setBufferSize(size) + for i := 0; i < embeddedCount*2; i++ { + buf := p.get() + if got, want := len(buf.data), size; got != want { + t.Errorf("#%d len(buf.data) = %d, want %d", i, got, want) + } + } +} + +func TestPut(t *testing.T) { + var p pool + buf := p.get() + p.put(buf) + if buf.data != nil { + t.Errorf("buf.data = %x, want nil", buf.data) + } +} diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go index b789e56e9..8b42575b4 100644 --- a/pkg/buffer/safemem.go +++ b/pkg/buffer/safemem.go @@ -44,7 +44,7 @@ func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, e // Need at least one buffer. firstBuf := v.data.Back() if firstBuf == nil { - firstBuf = bufferPool.Get().(*buffer) + firstBuf = v.pool.get() v.data.PushBack(firstBuf) } @@ -56,7 +56,7 @@ func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, e count -= l blocks = append(blocks, firstBuf.WriteBlock()) for count > 0 { - emptyBuf := bufferPool.Get().(*buffer) + emptyBuf := v.pool.get() v.data.PushBack(emptyBuf) block := emptyBuf.WriteBlock().TakeFirst64(count) count -= uint64(block.Len()) diff --git a/pkg/buffer/safemem_test.go b/pkg/buffer/safemem_test.go index 47f357e0c..721cc5934 100644 --- a/pkg/buffer/safemem_test.go +++ b/pkg/buffer/safemem_test.go @@ -23,6 +23,8 @@ import ( ) func TestSafemem(t *testing.T) { + const bufferSize = defaultBufferSize + testCases := []struct { name string input string diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go index e6901eadb..00652d675 100644 --- a/pkg/buffer/view.go +++ b/pkg/buffer/view.go @@ -27,6 +27,7 @@ import ( type View struct { data bufferList size int64 + pool pool } // TrimFront removes the first count bytes from the buffer. @@ -81,7 +82,7 @@ func (v *View) advanceRead(count int64) { buf = buf.Next() // Iterate. v.data.Remove(oldBuf) oldBuf.Reset() - bufferPool.Put(oldBuf) + v.pool.put(oldBuf) // Update counts. count -= sz @@ -118,7 +119,7 @@ func (v *View) Truncate(length int64) { // Drop the buffer completely; see above. v.data.Remove(buf) buf.Reset() - bufferPool.Put(buf) + v.pool.put(buf) v.size -= sz } } @@ -137,7 +138,7 @@ func (v *View) Grow(length int64, zero bool) { // Is there some space in the last buffer? if buf == nil || buf.Full() { - buf = bufferPool.Get().(*buffer) + buf = v.pool.get() v.data.PushBack(buf) } @@ -181,7 +182,7 @@ func (v *View) Prepend(data []byte) { for len(data) > 0 { // Do we need an empty buffer? - buf := bufferPool.Get().(*buffer) + buf := v.pool.get() v.data.PushFront(buf) // The buffer is empty; copy last chunk. @@ -211,7 +212,7 @@ func (v *View) Append(data []byte) { // Ensure there's a buffer with space. if buf == nil || buf.Full() { - buf = bufferPool.Get().(*buffer) + buf = v.pool.get() v.data.PushBack(buf) } @@ -297,7 +298,7 @@ func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) { // Ensure we have an empty buffer. 
if buf == nil || buf.Full() { - buf = bufferPool.Get().(*buffer) + buf = v.pool.get() v.data.PushBack(buf) } diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go index 3db1bc6ee..839af0223 100644 --- a/pkg/buffer/view_test.go +++ b/pkg/buffer/view_test.go @@ -16,11 +16,16 @@ package buffer import ( "bytes" + "context" "io" "strings" "testing" + + "gvisor.dev/gvisor/pkg/state" ) +const bufferSize = defaultBufferSize + func fillAppend(v *View, data []byte) { v.Append(data) } @@ -50,6 +55,30 @@ var fillFuncs = map[string]func(*View, []byte){ "writeFromReaderEnd": fillWriteFromReaderEnd, } +func BenchmarkReadAt(b *testing.B) { + b.ReportAllocs() + var v View + v.Append(make([]byte, 100)) + + buf := make([]byte, 10) + for i := 0; i < b.N; i++ { + v.ReadAt(buf, 0) + } +} + +func BenchmarkWriteRead(b *testing.B) { + b.ReportAllocs() + var v View + sz := 1000 + wbuf := make([]byte, sz) + rbuf := bytes.NewBuffer(make([]byte, sz)) + for i := 0; i < b.N; i++ { + v.Append(wbuf) + rbuf.Reset() + v.ReadToWriter(rbuf, int64(sz)) + } +} + func testReadAt(t *testing.T, v *View, offset int64, n int, wantStr string, wantErr error) { t.Helper() d := make([]byte, n) @@ -465,3 +494,51 @@ func TestView(t *testing.T) { } } } + +func doSaveAndLoad(t *testing.T, toSave, toLoad *View) { + t.Helper() + var buf bytes.Buffer + ctx := context.Background() + if _, err := state.Save(ctx, &buf, toSave); err != nil { + t.Fatal("state.Save:", err) + } + if _, err := state.Load(ctx, bytes.NewReader(buf.Bytes()), toLoad); err != nil { + t.Fatal("state.Load:", err) + } +} + +func TestSaveRestoreViewEmpty(t *testing.T) { + var toSave View + var v View + doSaveAndLoad(t, &toSave, &v) + + if got := v.pool.avail; got != nil { + t.Errorf("pool is not in zero state: v.pool.avail = %v, want nil", got) + } + if got := v.Flatten(); len(got) != 0 { + t.Errorf("v.Flatten() = %x, want []", got) + } +} + +func TestSaveRestoreView(t *testing.T) { + // Create data that fits 2.5 slots. + data := bytes.Join([][]byte{ + bytes.Repeat([]byte{1, 2}, defaultBufferSize), + bytes.Repeat([]byte{3}, defaultBufferSize/2), + }, nil) + + var toSave View + toSave.Append(data) + + var v View + doSaveAndLoad(t, &toSave, &v) + + // Next available slot at index 3; 0-2 slot are used. + i := 3 + if got, want := &v.pool.avail[0], &v.pool.embeddedStorage[i]; got != want { + t.Errorf("next available buffer points to %p, want %p (&v.pool.embeddedStorage[%d])", got, want, i) + } + if got := v.Flatten(); !bytes.Equal(got, data) { + t.Errorf("v.Flatten() = %x, want %x", got, data) + } +} diff --git a/pkg/context/BUILD b/pkg/context/BUILD index 239f31149..f33e23bf7 100644 --- a/pkg/context/BUILD +++ b/pkg/context/BUILD @@ -7,7 +7,6 @@ go_library( srcs = ["context.go"], visibility = ["//:sandbox"], deps = [ - "//pkg/amutex", "//pkg/log", ], ) diff --git a/pkg/context/context.go b/pkg/context/context.go index 5319b6d8d..2613bc752 100644 --- a/pkg/context/context.go +++ b/pkg/context/context.go @@ -26,7 +26,6 @@ import ( "context" "time" - "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/log" ) @@ -68,9 +67,10 @@ func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { // In both cases, values extracted from the Context should be used instead. type Context interface { log.Logger - amutex.Sleeper context.Context + ChannelSleeper + // UninterruptibleSleepStart indicates the beginning of an uninterruptible // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). 
If deactivate
// is true and the Context represents a Task, the Task's AddressSpace is
@@ -85,29 +85,60 @@ type Context interface {
 UninterruptibleSleepFinish(activate bool)
}

-// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep
-// methods for anonymous embedding in other types that do not implement sleeps.
-type NoopSleeper struct {
- amutex.NoopSleeper
+// A ChannelSleeper represents a goroutine that may sleep interruptibly, where
+// interruption is indicated by a channel becoming readable.
+type ChannelSleeper interface {
+ // SleepStart is called before going to sleep interruptibly. If SleepStart
+ // returns a non-nil channel and that channel becomes ready for receiving
+ // while the goroutine is sleeping, the goroutine should be woken, and
+ // SleepFinish(false) should be called. Otherwise, SleepFinish(true) should
+ // be called after the goroutine stops sleeping.
+ SleepStart() <-chan struct{}
+
+ // SleepFinish is called after an interruptibly-sleeping goroutine stops
+ // sleeping, as documented by SleepStart.
+ SleepFinish(success bool)
+
+ // Interrupted returns true if the channel returned by SleepStart is
+ // ready for receiving.
+ Interrupted() bool
+}
+
+// NoopSleeper is a noop implementation of ChannelSleeper and
+// Context.UninterruptibleSleep* methods for anonymous embedding in other types
+// that do not implement special behavior around sleeps.
+type NoopSleeper struct{}
+
+// SleepStart implements ChannelSleeper.SleepStart.
+func (NoopSleeper) SleepStart() <-chan struct{} {
+ return nil
+}
+
+// SleepFinish implements ChannelSleeper.SleepFinish.
+func (NoopSleeper) SleepFinish(success bool) {}
+
+// Interrupted implements ChannelSleeper.Interrupted.
+func (NoopSleeper) Interrupted() bool {
+ return false
}

-// UninterruptibleSleepStart does nothing.
-func (NoopSleeper) UninterruptibleSleepStart(bool) {}
+// UninterruptibleSleepStart implements Context.UninterruptibleSleepStart.
+func (NoopSleeper) UninterruptibleSleepStart(deactivate bool) {}

-// UninterruptibleSleepFinish does nothing.
-func (NoopSleeper) UninterruptibleSleepFinish(bool) {}
+// UninterruptibleSleepFinish implements Context.UninterruptibleSleepFinish.
+func (NoopSleeper) UninterruptibleSleepFinish(activate bool) {}

-// Deadline returns zero values, meaning no deadline.
+// Deadline implements context.Context.Deadline.
 func (NoopSleeper) Deadline() (time.Time, bool) {
 return time.Time{}, false
}

-// Done returns nil.
+// Done implements context.Context.Done.
 func (NoopSleeper) Done() <-chan struct{} {
 return nil
}

-// Err returns nil.
+// Err implements context.Context.Err.
 func (NoopSleeper) Err() error {
 return nil
}
diff --git a/pkg/coverage/BUILD b/pkg/coverage/BUILD
new file mode 100644
index 000000000..a198e8028
--- /dev/null
+++ b/pkg/coverage/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "coverage",
+ srcs = ["coverage.go"],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/sync",
+ "//pkg/usermem",
+ "@io_bazel_rules_go//go/tools/coverdata",
+ ],
+)
diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go
new file mode 100644
index 000000000..a4f4b2c5e
--- /dev/null
+++ b/pkg/coverage/coverage.go
@@ -0,0 +1,172 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package coverage provides an interface through which Go coverage data can
+// be collected, converted to kcov format, and exposed to userspace.
+//
+// Coverage can be enabled by calling bazel {build,test} with
+// --collect_coverage_data and --instrumentation_filter with the desired
+// coverage surface. This causes bazel to use the Go cover tool manually to
+// generate instrumented files. It injects a hook that registers all coverage
+// data with the coverdata package.
+package coverage
+
+import (
+ "fmt"
+ "io"
+ "sort"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/usermem"
+
+ "github.com/bazelbuild/rules_go/go/tools/coverdata"
+)
+
+// KcovAvailable returns whether the kcov coverage interface is available. It is
+// available as long as coverage is enabled for some files.
+func KcovAvailable() bool {
+ return len(coverdata.Cover.Blocks) > 0
+}
+
+// coverageMu must be held while accessing coverdata.Cover. This prevents
+// concurrent reads/writes from multiple threads collecting coverage data.
+var coverageMu sync.RWMutex
+
+// once ensures that globalData is only initialized once.
+var once sync.Once
+
+var globalData struct {
+ // files is the set of covered files sorted by filename. It is calculated at
+ // startup.
+ files []string
+
+ // syntheticPCs is a set of PCs calculated at startup, where the PC
+ // at syntheticPCs[i][j] corresponds to file i, block j.
+ syntheticPCs [][]uint64
+}
+
+// ClearCoverageData clears existing coverage data.
+func ClearCoverageData() {
+ coverageMu.Lock()
+ defer coverageMu.Unlock()
+ for _, counters := range coverdata.Cover.Counters {
+ for index := 0; index < len(counters); index++ {
+ atomic.StoreUint32(&counters[index], 0)
+ }
+ }
+}
+
+var coveragePool = sync.Pool{
+ New: func() interface{} {
+ return make([]byte, 0)
+ },
+}
+
+// ConsumeCoverageData builds and writes the collection of covered PCs. It
+// returns the number of bytes written.
+//
+// In Linux, a kernel configuration is set that compiles the kernel with a
+// custom function that is called at the beginning of every basic block, which
+// updates the memory-mapped coverage information. The Go coverage tool does not
+// allow us to inject arbitrary instructions into basic blocks, but it does
+// provide data that we can convert to a kcov-like format and transfer it to
+// userspace through a memory mapping.
+//
+// Note that this is not a strict implementation of kcov, which is especially
+// tricky to do because we do not have the same coverage tools available in Go
+// that are available for the actual Linux kernel. Whereas Linux writes program
+// counters to the kcov memory mapping as each basic block begins executing, Go
+// coverage tools only give us a count of basic blocks as they are executed.
Every time we return to userspace, we +// collect the coverage information and write out PCs for each block that was +// executed, providing userspace with the illusion that the kcov data is always +// up to date. For convenience, we also generate a unique synthetic PC for each +// block instead of using actual PCs. Finally, we do not provide thread-specific +// coverage data (each kcov instance only contains PCs executed by the thread +// owning it); instead, we will supply data for any file specified by -- +// instrumentation_filter. +// +// Note that we "consume", i.e. clear, coverdata when this function is run, to +// ensure that each event is only reported once. Due to the limitations of Go +// coverage tools, we reset the global coverage data every time this function is +// run. +func ConsumeCoverageData(w io.Writer) int { + once.Do(initCoverageData) + + coverageMu.Lock() + defer coverageMu.Unlock() + + total := 0 + var pcBuffer [8]byte + for fileIndex, file := range globalData.files { + counters := coverdata.Cover.Counters[file] + for index := 0; index < len(counters); index++ { + if atomic.LoadUint32(&counters[index]) == 0 { + continue + } + // Non-zero coverage data found; consume it and report as a PC. + atomic.StoreUint32(&counters[index], 0) + pc := globalData.syntheticPCs[fileIndex][index] + usermem.ByteOrder.PutUint64(pcBuffer[:], pc) + n, err := w.Write(pcBuffer[:]) + if err != nil { + if err == io.EOF { + // Simply stop writing if we encounter EOF; it's ok if we attempted to + // write more than we can hold. + return total + n + } + panic(fmt.Sprintf("Internal error writing PCs to kcov area: %v", err)) + } + total += n + } + } + + if total == 0 { + // An empty profile indicates that coverage is not enabled, in which case + // there shouldn't be any task work registered. + panic("kcov task work is registered, but no coverage data was found") + } + return total +} + +// initCoverageData initializes globalData. It should only be called once, +// before any kcov data is written. +func initCoverageData() { + // First, order all files. Then calculate synthetic PCs for every block + // (using the well-defined ordering for files as well). + for file := range coverdata.Cover.Blocks { + globalData.files = append(globalData.files, file) + } + sort.Strings(globalData.files) + + // nextSyntheticPC is the first PC that we generate for a block. + // + // This uses a standard-looking kernel range for simplicity. + // + // FIXME(b/160639712): This is only necessary because syzkaller requires + // addresses in the kernel range. If we can remove this constraint, then we + // should be able to use the actual addresses. + var nextSyntheticPC uint64 = 0xffffffff80000000 + for _, file := range globalData.files { + blocks := coverdata.Cover.Blocks[file] + thisFile := make([]uint64, 0, len(blocks)) + for range blocks { + thisFile = append(thisFile, nextSyntheticPC) + nextSyntheticPC++ // Advance. 
+ } + globalData.syntheticPCs = append(globalData.syntheticPCs, thisFile) + } +} diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go index c9bd40e1b..e4ae0d689 100644 --- a/pkg/cpuid/cpuid_parse_x86_test.go +++ b/pkg/cpuid/cpuid_parse_x86_test.go @@ -32,27 +32,27 @@ func kernelVersion() (int, int, error) { return 0, 0, err } - var r string + var sb strings.Builder for _, b := range u.Release { if b == 0 { break } - r += string(b) + sb.WriteByte(byte(b)) } - s := strings.Split(r, ".") + s := strings.Split(sb.String(), ".") if len(s) < 2 { - return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r) + return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", sb.String()) } major, err := strconv.Atoi(s[0]) if err != nil { - return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err) + return 0, 0, fmt.Errorf("error parsing major version %q in %q: %w", s[0], sb.String(), err) } minor, err := strconv.Atoi(s[1]) if err != nil { - return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err) + return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %w", s[1], sb.String(), err) } return major, minor, nil diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index bee28b68d..a493e3407 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -6,6 +6,7 @@ go_library( name = "eventchannel", srcs = [ "event.go", + "event_any.go", "rate.go", ], visibility = ["//:sandbox"], @@ -14,8 +15,9 @@ go_library( "//pkg/log", "//pkg/sync", "//pkg/unet", - "@com_github_golang_protobuf//proto:go_default_library", - "@com_github_golang_protobuf//ptypes:go_default_library_gen", + "@org_golang_google_protobuf//encoding/prototext:go_default_library", + "@org_golang_google_protobuf//proto:go_default_library", + "@org_golang_google_protobuf//types/known/anypb:go_default_library", "@org_golang_x_time//rate:go_default_library", ], ) @@ -32,6 +34,6 @@ go_test( library = ":eventchannel", deps = [ "//pkg/sync", - "@com_github_golang_protobuf//proto:go_default_library", + "@org_golang_google_protobuf//proto:go_default_library", ], ) diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index 9a29c58bd..7172ce75d 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -24,8 +24,8 @@ import ( "fmt" "syscall" - "github.com/golang/protobuf/proto" - "github.com/golang/protobuf/ptypes" + "google.golang.org/protobuf/encoding/prototext" + "google.golang.org/protobuf/proto" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" @@ -118,22 +118,6 @@ func (me *multiEmitter) Close() error { return err } -func marshal(msg proto.Message) ([]byte, error) { - anypb, err := ptypes.MarshalAny(msg) - if err != nil { - return nil, err - } - - // Wire format is uvarint message length followed by binary proto. - bufMsg, err := proto.Marshal(anypb) - if err != nil { - return nil, err - } - p := make([]byte, binary.MaxVarintLen64) - n := binary.PutUvarint(p, uint64(len(bufMsg))) - return append(p[:n], bufMsg...), nil -} - // socketEmitter emits proto messages on a socket. type socketEmitter struct { socket *unet.Socket @@ -155,10 +139,19 @@ func SocketEmitter(fd int) (Emitter, error) { // Emit implements Emitter.Emit. 
func (s *socketEmitter) Emit(msg proto.Message) (bool, error) { - p, err := marshal(msg) + any, err := newAny(msg) if err != nil { return false, err } + bufMsg, err := proto.Marshal(any) + if err != nil { + return false, err + } + + // Wire format is uvarint message length followed by binary proto. + p := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(p, uint64(len(bufMsg))) + p = append(p[:n], bufMsg...) for done := 0; done < len(p); { n, err := s.socket.Write(p[done:]) if err != nil { @@ -166,6 +159,7 @@ func (s *socketEmitter) Emit(msg proto.Message) (bool, error) { } done += n } + return false, nil } @@ -189,9 +183,13 @@ func DebugEmitterFrom(inner Emitter) Emitter { } func (d *debugEmitter) Emit(msg proto.Message) (bool, error) { + text, err := prototext.Marshal(msg) + if err != nil { + return false, err + } ev := &pb.DebugEvent{ - Name: proto.MessageName(msg), - Text: proto.MarshalTextString(msg), + Name: string(msg.ProtoReflect().Descriptor().FullName()), + Text: string(text), } return d.inner.Emit(ev) } diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto index 34468f072..4b24ac47c 100644 --- a/pkg/eventchannel/event.proto +++ b/pkg/eventchannel/event.proto @@ -16,7 +16,7 @@ syntax = "proto3"; package gvisor; -// A debug event encapsulates any other event protobuf in text format. This is +// DebugEvent encapsulates any other event protobuf in text format. This is // useful because clients reading events emitted this way do not need to link // the event protobufs to display them in a human-readable format. message DebugEvent { diff --git a/pkg/eventchannel/event_any.go b/pkg/eventchannel/event_any.go new file mode 100644 index 000000000..a5549f6cd --- /dev/null +++ b/pkg/eventchannel/event_any.go @@ -0,0 +1,25 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventchannel + +import ( + "google.golang.org/protobuf/types/known/anypb" + + "google.golang.org/protobuf/proto" +) + +func newAny(m proto.Message) (*anypb.Any, error) { + return anypb.New(m) +} diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go index 43750360b..0dd408f76 100644 --- a/pkg/eventchannel/event_test.go +++ b/pkg/eventchannel/event_test.go @@ -19,7 +19,7 @@ import ( "testing" "time" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/eventchannel/rate.go b/pkg/eventchannel/rate.go index 179226c92..74960e16a 100644 --- a/pkg/eventchannel/rate.go +++ b/pkg/eventchannel/rate.go @@ -15,8 +15,8 @@ package eventchannel import ( - "github.com/golang/protobuf/proto" "golang.org/x/time/rate" + "google.golang.org/protobuf/proto" ) // rateLimitedEmitter wraps an emitter and limits events to the given limits. diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index 83bcfe220..cc6b0cdf1 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -49,7 +49,7 @@ func fixCount(n int, err error) (int, error) { // Read implements io.Reader. 
func (r *ReadWriter) Read(b []byte) (int, error) { - c, err := fixCount(syscall.Read(int(atomic.LoadInt64(&r.fd)), b)) + c, err := fixCount(syscall.Read(r.FD(), b)) if c == 0 && len(b) > 0 && err == nil { return 0, io.EOF } @@ -62,7 +62,7 @@ func (r *ReadWriter) Read(b []byte) (int, error) { func (r *ReadWriter) ReadAt(b []byte, off int64) (c int, err error) { for len(b) > 0 { var m int - m, err = fixCount(syscall.Pread(int(atomic.LoadInt64(&r.fd)), b, off)) + m, err = fixCount(syscall.Pread(r.FD(), b, off)) if m == 0 && err == nil { return c, io.EOF } @@ -82,7 +82,7 @@ func (r *ReadWriter) Write(b []byte) (int, error) { var n, remaining int for remaining = len(b); remaining > 0; { woff := len(b) - remaining - n, err = syscall.Write(int(atomic.LoadInt64(&r.fd)), b[woff:]) + n, err = syscall.Write(r.FD(), b[woff:]) if n > 0 { // syscall.Write wrote some bytes. This is the common case. @@ -110,7 +110,7 @@ func (r *ReadWriter) Write(b []byte) (int, error) { func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) { for len(b) > 0 { var m int - m, err = fixCount(syscall.Pwrite(int(atomic.LoadInt64(&r.fd)), b, off)) + m, err = fixCount(syscall.Pwrite(r.FD(), b, off)) if err != nil { break } @@ -121,6 +121,16 @@ func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) { return } +// FD returns the owned file descriptor. Ownership remains unchanged. +func (r *ReadWriter) FD() int { + return int(atomic.LoadInt64(&r.fd)) +} + +// String implements Stringer.String(). +func (r *ReadWriter) String() string { + return fmt.Sprintf("FD: %d", r.FD()) +} + // FD owns a host file descriptor. // // It is similar to os.File, with a few important distinctions: @@ -167,6 +177,23 @@ func NewFromFile(file *os.File) (*FD, error) { return New(fd), nil } +// NewFromFiles creates new FDs for each file in the slice. +func NewFromFiles(files []*os.File) ([]*FD, error) { + rv := make([]*FD, 0, len(files)) + for _, f := range files { + new, err := NewFromFile(f) + if err != nil { + // Cleanup on error. + for _, fd := range rv { + fd.Close() + } + return nil, err + } + rv = append(rv, new) + } + return rv, nil +} + // Open is equivalent to open(2). func Open(path string, openmode int, perm uint32) (*FD, error) { f, err := syscall.Open(path, openmode|syscall.O_LARGEFILE, perm) @@ -204,11 +231,6 @@ func (f *FD) Release() int { return int(atomic.SwapInt64(&f.fd, -1)) } -// FD returns the file descriptor owned by FD. FD retains ownership. -func (f *FD) FD() int { - return int(atomic.LoadInt64(&f.fd)) -} - // File converts the FD to an os.File. // // FD does not transfer ownership of the file descriptor (it will be @@ -219,7 +241,7 @@ func (f *FD) FD() int { // This operation is somewhat expensive, so care should be taken to minimize // its use. func (f *FD) File() (*os.File, error) { - fd, err := syscall.Dup(int(atomic.LoadInt64(&f.fd))) + fd, err := syscall.Dup(f.FD()) if err != nil { return nil, err } diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index 4225b04dd..ec2f997a2 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -65,8 +65,7 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { // epollWait performs a blocking wait on epfd. 
// -// Preconditions: -// * len(events) > 0 +// Preconditions: len(events) > 0 func epollWait(epfd int, events []syscall.EpollEvent, msec int) (int, error) { if len(events) == 0 { panic("Empty events passed to EpollWait") diff --git a/pkg/flipcall/flipcall.go b/pkg/flipcall/flipcall.go index ec742c091..c4a3366ce 100644 --- a/pkg/flipcall/flipcall.go +++ b/pkg/flipcall/flipcall.go @@ -179,8 +179,10 @@ const ( // Connect blocks until the peer Endpoint has called Endpoint.RecvFirst(). // -// Preconditions: ep is a client Endpoint. ep.Connect(), ep.RecvFirst(), -// ep.SendRecv(), and ep.SendLast() have never been called. +// Preconditions: +// * ep is a client Endpoint. +// * ep.Connect(), ep.RecvFirst(), ep.SendRecv(), and ep.SendLast() have never +// been called. func (ep *Endpoint) Connect() error { err := ep.ctrlConnect() if err == nil { @@ -192,8 +194,9 @@ func (ep *Endpoint) Connect() error { // RecvFirst blocks until the peer Endpoint calls Endpoint.SendRecv(), then // returns the datagram length specified by that call. // -// Preconditions: ep is a server Endpoint. ep.SendRecv(), ep.RecvFirst(), and -// ep.SendLast() have never been called. +// Preconditions: +// * ep is a server Endpoint. +// * ep.SendRecv(), ep.RecvFirst(), and ep.SendLast() have never been called. func (ep *Endpoint) RecvFirst() (uint32, error) { if err := ep.ctrlWaitFirst(); err != nil { return 0, err @@ -211,10 +214,12 @@ func (ep *Endpoint) RecvFirst() (uint32, error) { // datagram length, then blocks until the peer Endpoint calls // Endpoint.SendRecv() or Endpoint.SendLast(). // -// Preconditions: dataLen <= ep.DataCap(). No previous call to ep.SendRecv() or -// ep.RecvFirst() has returned an error. ep.SendLast() has never been called. -// If ep is a client Endpoint, ep.Connect() has previously been called and -// returned nil. +// Preconditions: +// * dataLen <= ep.DataCap(). +// * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error. +// * ep.SendLast() has never been called. +// * If ep is a client Endpoint, ep.Connect() has previously been called and +// returned nil. func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) @@ -240,10 +245,12 @@ func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) { // SendLast causes the peer Endpoint's call to Endpoint.SendRecv() or // Endpoint.RecvFirst() to return with the given datagram length. // -// Preconditions: dataLen <= ep.DataCap(). No previous call to ep.SendRecv() or -// ep.RecvFirst() has returned an error. ep.SendLast() has never been called. -// If ep is a client Endpoint, ep.Connect() has previously been called and -// returned nil. +// Preconditions: +// * dataLen <= ep.DataCap(). +// * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error. +// * ep.SendLast() has never been called. +// * If ep is a client Endpoint, ep.Connect() has previously been called and +// returned nil. 
func (ep *Endpoint) SendLast(dataLen uint32) error { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) diff --git a/pkg/lisafs/README.md b/pkg/lisafs/README.md new file mode 100644 index 000000000..51d0d40e5 --- /dev/null +++ b/pkg/lisafs/README.md @@ -0,0 +1,363 @@ +# Replacing 9P + +## Background + +The Linux filesystem model consists of the following key aspects (modulo mounts, +which are outside the scope of this discussion): + +- A `struct inode` represents a "filesystem object", such as a directory or a + regular file. "Filesystem object" is most precisely defined by the practical + properties of an inode, such as an immutable type (regular file, directory, + symbolic link, etc.) and its independence from the path originally used to + obtain it. + +- A `struct dentry` represents a node in a filesystem tree. Semantically, each + dentry is immutably associated with an inode representing the filesystem + object at that position. (Linux implements optimizations involving reuse of + unreferenced dentries, which allows their associated inodes to change, but + this is outside the scope of this discussion.) + +- A `struct file` represents an open file description (hereafter FD) and is + needed to perform I/O. Each FD is immutably associated with the dentry + through which it was opened. + +The current gVisor virtual filesystem implementation (hereafter VFS1) closely +imitates the Linux design: + +- `struct inode` => `fs.Inode` + +- `struct dentry` => `fs.Dirent` + +- `struct file` => `fs.File` + +gVisor accesses most external filesystems through a variant of the 9P2000.L +protocol, including extensions for performance (`walkgetattr`) and for features +not supported by vanilla 9P2000.L (`flushf`, `lconnect`). The 9P protocol family +is inode-based; 9P fids represent a file (equivalently "file system object"), +and the protocol is structured around alternatively obtaining fids to represent +files (with `walk` and, in gVisor, `walkgetattr`) and performing operations on +those fids. + +In the sections below, a **shared** filesystem is a filesystem that is *mutably* +accessible by multiple concurrent clients, such that a **non-shared** filesystem +is a filesystem that is either read-only or accessible by only a single client. + +## Problems + +### Serialization of Path Component RPCs + +Broadly speaking, VFS1 traverses each path component in a pathname, alternating +between verifying that each traversed dentry represents an inode that represents +a searchable directory and moving to the next dentry in the path. + +In the context of a remote filesystem, the structure of this traversal means +that - modulo caching - a path involving N components requires at least N-1 +*sequential* RPCs to obtain metadata for intermediate directories, incurring +significant latency. (In vanilla 9P2000.L, 2(N-1) RPCs are required: N-1 `walk` +and N-1 `getattr`. We added the `walkgetattr` RPC to reduce this overhead.) On +non-shared filesystems, this overhead is primarily significant during +application startup; caching mitigates much of this overhead at steady state. On +shared filesystems, where correct caching requires revalidation (requiring RPCs +for each revalidated directory anyway), this overhead is consistently ruinous. + +### Inefficient RPCs + +9P is not exceptionally economical with RPCs in general. 
In addition to the
+issue described above:
+
+- Opening an existing file in 9P involves at least 2 RPCs: `walk` to produce
+ an unopened fid representing the file, and `lopen` to open the fid.
+
+- Creating a file also involves at least 2 RPCs: `walk` to produce an unopened
+ fid representing the parent directory, and `lcreate` to create the file and
+ convert the fid to an open fid representing the created file. In practice,
+ both the Linux and gVisor 9P clients expect to have an unopened fid for the
+ created file (necessitating an additional `walk`), as well as attributes for
+ the created file (necessitating an additional `getattr`), for a total of 4
+ RPCs. (In a shared filesystem, where whether a file already exists can
+ change between RPCs, a correct implementation of `open(O_CREAT)` would have
+ to alternate between these two paths (plus `clunk`ing the temporary fid
+ between alternations, since the nature of the `fid` differs between the two
+ paths). Neither Linux nor gVisor implements the required alternation, so
+ `open(O_CREAT)` without `O_EXCL` can spuriously fail with `EEXIST` on both.)
+
+- Closing (`clunk`ing) a fid requires an RPC. VFS1 issues this RPC
+ asynchronously in an attempt to reduce critical path latency, but scheduling
+ overhead makes this not clearly advantageous in practice.
+
+- `read` and `readdir` can return partial reads without a way to indicate EOF,
+ necessitating an additional final read to detect EOF.
+
+- Operations that affect filesystem state do not consistently return updated
+ filesystem state. In gVisor, the client implementation attempts to handle
+ this by tracking what it thinks updated state "should" be; this is complex,
+ and especially brittle for timestamps (which are often not arbitrarily
+ settable). In Linux, the client implementation invalidates cached metadata
+ whenever it performs such an operation, and reloads it when a dentry
+ corresponding to an inode with no valid cached metadata is revalidated; this
+ is simple, but necessitates an additional `getattr`.
+
+### Dentry/Inode Ambiguity
+
+As noted above, 9P's documentation tends to imply that unopened fids represent
+an inode. In practice, most filesystem APIs present very limited interfaces for
+working with inodes at best, such that the interpretation of unopened fids
+varies:
+
+- Linux's 9P client associates unopened fids with (dentry, uid) pairs. When
+ caching is enabled, it also associates each inode with the first fid opened
+ writably that references that inode, in order to support page cache
+ writeback.
+
+- gVisor's 9P client associates unopened fids with inodes, and also caches
+ opened fids in inodes in a manner similar to Linux.
+
+- The runsc fsgofer associates unopened fids with both "dentries" (host
+ filesystem paths) and "inodes" (host file descriptors); which is used
+ depends on the operation invoked on the fid.
+
+For non-shared filesystems, this confusion has resulted in correctness issues
+that are (in gVisor) currently handled by a number of coarse-grained locks that
+serialize renames with all other filesystem operations. For shared filesystems,
+this means inconsistent behavior in the presence of concurrent mutation.
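To make the contrast concrete, here is a rough, hypothetical sketch of the kind of single path-based RPC argued for in the design below. Every name in it is invented for illustration; none appears in this change:

	// OpenCreateReq and OpenCreateResp sketch a single RPC that covers the
	// walk/lcreate/walk/getattr sequence described above.
	type OpenCreateReq struct {
		FSID  uint32 // Filesystem tree ID; see "Filesystem Multiplexing" below.
		Path  string // Resolved relative to that filesystem's root.
		Flags uint32 // Open flags, including O_CREAT-style semantics.
		Mode  uint16 // Mode for a newly created file.
	}

	type OpenCreateResp struct {
		FD      uint32 // Server-side FD for subsequent FD-based RPCs.
		Created bool   // Whether this call created the file.
		// Attributes are returned eagerly so that the client does not need
		// a follow-up getattr RPC.
		Size  uint64
		Mode  uint16
		Mtime int64
	}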
+ +## Design + +Almost all Linux filesystem syscalls describe filesystem resources in one of two +ways: + +- Path-based: A filesystem position is described by a combination of a + starting position and a sequence of path components relative to that + position, where the starting position is one of: + + - The VFS root (defined by mount namespace and chroot), for absolute paths + + - The VFS position of an existing FD, for relative paths passed to `*at` + syscalls (e.g. `statat`) + + - The current working directory, for relative paths passed to non-`*at` + syscalls and `*at` syscalls with `AT_FDCWD` + +- File-description-based: A filesystem object is described by an existing FD, + passed to a `f*` syscall (e.g. `fstat`). + +Many of our issues with 9P arise from its (and VFS') interposition of a model +based on inodes between the filesystem syscall API and filesystem +implementations. We propose to replace 9P with a protocol that does not feature +inodes at all, and instead closely follows the filesystem syscall API by +featuring only path-based and FD-based operations, with minimal deviations as +necessary to ameliorate deficiencies in the syscall interface (see below). This +approach addresses the issues described above: + +- Even on shared filesystems, most application filesystem syscalls are + translated to a single RPC (possibly excepting special cases described + below), which is a logical lower bound. + +- The behavior of application syscalls on shared filesystems is + straightforwardly predictable: path-based syscalls are translated to + path-based RPCs, which will re-lookup the file at that path, and FD-based + syscalls are translated to FD-based RPCs, which use an existing open file + without performing another lookup. (This is at least true on gofers that + proxy the host local filesystem; other filesystems that lack support for + e.g. certain operations on FDs may have different behavior, but this + divergence is at least still predictable and inherent to the underlying + filesystem implementation.) + +Note that this approach is only feasible in gVisor's next-generation virtual +filesystem (VFS2), which does not assume the existence of inodes and allows the +remote filesystem client to translate whole path-based syscalls into RPCs. Thus +one of the unavoidable tradeoffs associated with such a protocol vs. 9P is the +inability to construct a Linux client that is performance-competitive with +gVisor. + +### File Permissions + +Many filesystem operations are side-effectual, such that file permissions must +be checked before such operations take effect. The simplest approach to file +permission checking is for the sentry to obtain permissions from the remote +filesystem, then apply permission checks in the sentry before performing the +application-requested operation. However, this requires an additional RPC per +application syscall (which can't be mitigated by caching on shared filesystems). +Alternatively, we may delegate file permission checking to gofers. In general, +file permission checks depend on the following properties of the accessor: + +- Filesystem UID/GID + +- Supplementary GIDs + +- Effective capabilities in the accessor's user namespace (i.e. 
the accessor's + effective capability set) + +- All UIDs and GIDs mapped in the accessor's user namespace (which determine + if the accessor's capabilities apply to accessed files) + +We may choose to delay implementation of file permission checking delegation, +although this is potentially costly since it doubles the number of required RPCs +for most operations on shared filesystems. We may also consider compromise +options, such as only delegating file permission checks for accessors in the +root user namespace. + +### Symbolic Links + +gVisor usually interprets symbolic link targets in its VFS rather than on the +filesystem containing the symbolic link; thus e.g. a symlink to +"/proc/self/maps" on a remote filesystem resolves to said file in the sentry's +procfs rather than the host's. This implies that: + +- Remote filesystem servers that proxy filesystems supporting symlinks must + check if each path component is a symlink during path traversal. + +- Absolute symlinks require that the sentry restart the operation at its + contextual VFS root (which is task-specific and may not be on a remote + filesystem at all), so if a remote filesystem server encounters an absolute + symlink during path traversal on behalf of a path-based operation, it must + terminate path traversal and return the symlink target. + +- Relative symlinks begin target resolution in the parent directory of the + symlink, so in theory most relative symlinks can be handled automatically + during the path traversal that encounters the symlink, provided that said + traversal is supplied with the number of remaining symlinks before `ELOOP`. + However, the new path traversed by the symlink target may cross VFS mount + boundaries, such that it's only safe for remote filesystem servers to + speculatively follow relative symlinks for side-effect-free operations such + as `stat` (where the sentry can simply ignore results that are inapplicable + due to crossing mount boundaries). We may choose to delay implementation of + this feature, at the cost of an additional RPC per relative symlink (note + that even if the symlink target crosses a mount boundary, the sentry will + need to `stat` the path to the mount boundary to confirm that each traversed + component is an accessible directory); until it is implemented, relative + symlinks may be handled like absolute symlinks, by terminating path + traversal and returning the symlink target. + +The possibility of symlinks (and the possibility of a compromised sentry) means +that the sentry may issue RPCs with paths that, in the absence of symlinks, +would traverse beyond the root of the remote filesystem. For example, the sentry +may issue an RPC with a path like "/foo/../..", on the premise that if "/foo" is +a symlink then the resulting path may be elsewhere on the remote filesystem. To +handle this, path traversal must also track its current depth below the remote +filesystem root, and terminate path traversal if it would ascend beyond this +point. + +### Path Traversal + +Since path-based VFS operations will translate to path-based RPCs, filesystem +servers will need to handle path traversal. From the perspective of a given +filesystem implementation in the server, there are two basic approaches to path +traversal: + +- Inode-walk: For each path component, obtain a handle to the underlying + filesystem object (e.g. with `open(O_PATH)`), check if that object is a + symlink (as described above) and that that object is accessible by the + caller (e.g. 
with `fstat()`), then continue to the next path component (e.g. + with `openat()`). This ensures that the checked filesystem object is the one + used to obtain the next object in the traversal, which is intuitively + appealing. However, while this approach works for host local filesystems, it + requires features that are not widely supported by other filesystems. + +- Path-walk: For each path component, use a path-based operation to determine + if the filesystem object currently referred to by that path component is a + symlink / is accessible. This is highly portable, but suffers from quadratic + behavior (at the level of the underlying filesystem implementation, the + first path component will be traversed a number of times equal to the number + of path components in the path). + +The implementation should support either option by delegating path traversal to +filesystem implementations within the server (like VFS and the remote filesystem +protocol itself), as inode-walking is still safe, efficient, amenable to FD +caching, and implementable on non-shared host local filesystems (a sufficiently +common case as to be worth considering in the design). + +Both approaches are susceptible to race conditions that may permit sandboxed +filesystem escapes: + +- Under inode-walk, a malicious application may cause a directory to be moved + (with `rename`) during path traversal, such that the filesystem + implementation incorrectly determines whether subsequent inodes are located + in paths that should be visible to sandboxed applications. + +- Under path-walk, a malicious application may cause a non-symlink file to be + replaced with a symlink during path traversal, such that following path + operations will incorrectly follow the symlink. + +Both race conditions can, to some extent, be mitigated in filesystem server +implementations by synchronizing path traversal with the hazardous operations in +question. However, shared filesystems are frequently used to share data between +sandboxed and unsandboxed applications in a controlled way, and in some cases a +malicious sandboxed application may be able to take advantage of a hazardous +filesystem operation performed by an unsandboxed application. In some cases, +filesystem features may be available to ensure safety even in such cases (e.g. +[the new openat2() syscall](https://man7.org/linux/man-pages/man2/openat2.2.html)), +but it is not clear how to solve this problem in general. (Note that this issue +is not specific to our design; rather, it is a fundamental limitation of +filesystem sandboxing.) + +### Filesystem Multiplexing + +A given sentry may need to access multiple distinct remote filesystems (e.g. +different volumes for a given container). In many cases, there is no advantage +to serving these filesystems from distinct filesystem servers, or accessing them +through distinct connections (factors such as maximum RPC concurrency should be +based on available host resources). Therefore, the protocol should support +multiplexing of distinct filesystem trees within a single session. 9P supports +this by allowing multiple calls to the `attach` RPC to produce fids representing +distinct filesystem trees, but this is somewhat clunky; we propose a much +simpler mechanism wherein each message that conveys a path also conveys a +numeric filesystem ID that identifies a filesystem tree. 
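The depth rule described under "Symbolic Links" above is mechanical enough to sketch. The following is an illustrative approximation only, assuming the standard `errors` package is imported; a real server would update the depth as it resolves each component, including symlink targets:

	// checkDepth rejects traversals that would ascend above the
	// per-filesystem root; the name is invented for illustration.
	func checkDepth(components []string) error {
		depth := 0
		for _, c := range components {
			switch c {
			case "", ".":
				// No movement.
			case "..":
				depth--
				if depth < 0 {
					return errors.New("path would escape the filesystem root")
				}
			default:
				depth++
			}
		}
		return nil
	}

For example, the "/foo/../.." path mentioned above yields depths 1, 0, -1 and is rejected.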
+ +## Alternatives Considered + +### Additional Extensions to 9P + +There are at least three conceptual aspects to 9P: + +- Wire format: messages with a 4-byte little-endian size prefix, strings with + a 2-byte little-endian size prefix, etc. Whether the wire format is worth + retaining is unclear; in particular, it's unclear that the 9P wire format + has a significant advantage over protobufs, which are substantially easier + to extend. Note that the official Go protobuf implementation is widely known + to suffer from a significant number of performance deficiencies, so if we + choose to switch to protobuf, we may need to use an alternative toolchain + such as `gogo/protobuf` (which is also widely used in the Go ecosystem, e.g. + by Kubernetes). + +- Filesystem model: fids, qids, etc. Discarding this is one of the motivations + for this proposal. + +- RPCs: Twalk, Tlopen, etc. In addition to previously-described + inefficiencies, most of these are dependent on the filesystem model and + therefore must be discarded. + +### FUSE + +The FUSE (Filesystem in Userspace) protocol is frequently used to provide +arbitrary userspace filesystem implementations to a host Linux kernel. +Unfortunately, FUSE is also inode-based, and therefore doesn't address any of +the problems we have with 9P. + +### virtio-fs + +virtio-fs is an ongoing project aimed at improving Linux VM filesystem +performance when accessing Linux host filesystems (vs. virtio-9p). In brief, it +is based on: + +- Using a FUSE client in the guest that communicates over virtio with a FUSE + server in the host. + +- Using DAX to map the host page cache into the guest. + +- Using a file metadata table in shared memory to avoid VM exits for metadata + updates. + +None of these improvements seem applicable to gVisor: + +- As explained above, FUSE is still inode-based, so it is still susceptible to + most of the problems we have with 9P. + +- Our use of host file descriptors already allows us to leverage the host page + cache for file contents. + +- Our need for shared filesystem coherence is usually based on a user + requirement that an out-of-sandbox filesystem mutation is guaranteed to be + visible by all subsequent observations from within the sandbox, or vice + versa; it's not clear that this can be guaranteed without a synchronous + signaling mechanism like an RPC. diff --git a/pkg/marshal/BUILD b/pkg/marshal/BUILD new file mode 100644 index 000000000..aac0161fa --- /dev/null +++ b/pkg/marshal/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "marshal", + srcs = [ + "marshal.go", + "marshal_impl_util.go", + ], + visibility = [ + "//:sandbox", + ], + deps = ["//pkg/usermem"], +) diff --git a/pkg/marshal/marshal.go b/pkg/marshal/marshal.go new file mode 100644 index 000000000..d8cb44b40 --- /dev/null +++ b/pkg/marshal/marshal.go @@ -0,0 +1,184 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
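Before the package documentation that follows, a minimal, hypothetical illustration of how the directive is consumed (ExampleTimespec is invented; the generated method set is the Marshallable interface defined below):

	// ExampleTimespec is a hypothetical ABI struct. The +marshal directive
	// causes go_marshal to generate SizeBytes, MarshalBytes, UnmarshalBytes,
	// Packed, MarshalUnsafe, UnmarshalUnsafe, CopyIn, CopyOut, CopyOutN, and
	// WriteTo for it, satisfying the Marshallable interface.
	//
	// +marshal
	type ExampleTimespec struct {
		Sec  int64
		Nsec int64
	}

A caller holding a CopyContext (typically a kernel.Task) could then copy such a struct from task memory with something like `var ts ExampleTimespec; ts.CopyIn(task, addr)`.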
+
+// Package marshal defines the Marshallable interface for
+// serializing and deserializing Go data structures to/from memory, according
+// to the Linux ABI.
+//
+// Implementations of this interface are typically automatically generated by
+// tools/go_marshal. See the go_marshal README for details.
+package marshal
+
+import (
+ "io"
+
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// CopyContext defines the memory operations required to marshal to and from
+// user memory. Typically, kernel.Task is used to provide implementations for
+// these operations.
+type CopyContext interface {
+ // CopyScratchBuffer provides a task goroutine-local scratch buffer. See
+ // kernel.CopyScratchBuffer.
+ CopyScratchBuffer(size int) []byte
+
+ // CopyOutBytes writes the contents of b to the task's memory. See
+ // kernel.CopyOutBytes.
+ CopyOutBytes(addr usermem.Addr, b []byte) (int, error)
+
+ // CopyInBytes reads the contents of the task's memory to b. See
+ // kernel.CopyInBytes.
+ CopyInBytes(addr usermem.Addr, b []byte) (int, error)
+}
+
+// Marshallable represents operations on a type that can be marshalled to and
+// from memory.
+//
+// go-marshal automatically generates implementations for this interface for
+// types marked as '+marshal'.
+type Marshallable interface {
+ io.WriterTo
+
+ // SizeBytes is the size of the memory representation of a type in
+ // marshalled form.
+ //
+ // SizeBytes must handle a nil receiver. Practically, this means SizeBytes
+ // cannot dereference any fields on the object implementing it (but will
+ // likely make use of the type of these fields).
+ SizeBytes() int
+
+ // MarshalBytes serializes a copy of a type to dst.
+ // Precondition: dst must be at least SizeBytes() in length.
+ MarshalBytes(dst []byte)
+
+ // UnmarshalBytes deserializes a type from src.
+ // Precondition: src must be at least SizeBytes() in length.
+ UnmarshalBytes(src []byte)
+
+ // Packed returns true if the marshalled size of the type is the same as the
+ // size it occupies in memory. This happens when the type has no fields
+ // starting at unaligned addresses (should always be true by default for ABI
+ // structs, verified by automatically generated tests when using
+ // go_marshal), and has no fields marked `marshal:"unaligned"`.
+ //
+ // Packed must return the same result for all possible values of the type
+ // implementing it. Violating this constraint implies the type doesn't have
+ // a static memory layout, and will lead to memory corruption.
+ // Go-marshal-generated code reuses the result of Packed for multiple values
+ // of the same type.
+ Packed() bool
+
+ // MarshalUnsafe serializes a type by bulk copying its in-memory
+ // representation to the dst buffer. This is only safe to do when the type
+ // has no implicit padding, see Marshallable.Packed. When Packed would
+ // return false, MarshalUnsafe should fall back to the safer but slower
+ // MarshalBytes.
+ // Precondition: dst must be at least SizeBytes() in length.
+ MarshalUnsafe(dst []byte)
+
+ // UnmarshalUnsafe deserializes a type by directly copying to the underlying
+ // memory allocated for the object by the runtime.
+ //
+ // This allows much faster unmarshalling of types which have no implicit
+ // padding, see Marshallable.Packed. When Packed would return false,
+ // UnmarshalUnsafe should fall back to the safer but slower unmarshal
+ // mechanism implemented in UnmarshalBytes.
+ // Precondition: src must be at least SizeBytes() in length.
+ UnmarshalUnsafe(src []byte)
+
+ // CopyIn deserializes a Marshallable type from a task's memory. This may
+ // only be called from a task goroutine. This is more efficient than calling
+ // UnmarshalUnsafe on Marshallable.Packed types, as the type being
+ // marshalled does not escape. The implementation should avoid creating
+ // extra copies in memory by directly deserializing to the object's
+ // underlying memory.
+ //
+ // If the copy-in from the task memory is only partially successful, CopyIn
+ // should still attempt to deserialize as much data as possible. See comment
+ // for UnmarshalBytes.
+ CopyIn(cc CopyContext, addr usermem.Addr) (int, error)
+
+ // CopyOut serializes a Marshallable type to a task's memory. This may only
+ // be called from a task goroutine. This is more efficient than calling
+ // MarshalUnsafe on Marshallable.Packed types, as the type being serialized
+ // does not escape. The implementation should avoid creating extra copies in
+ // memory by directly serializing from the object's underlying memory.
+ //
+ // The copy-out to the task memory may be partially successful, in which
+ // case CopyOut returns how much data was serialized. See comment for
+ // MarshalBytes for implications.
+ CopyOut(cc CopyContext, addr usermem.Addr) (int, error)
+
+ // CopyOutN is like CopyOut, but explicitly requests a partial
+ // copy-out. Note that this may yield unexpected results for non-packed
+ // types and the caller may only want to allow this for packed types. See
+ // comment on MarshalBytes.
+ //
+ // The limit must be less than or equal to SizeBytes().
+ CopyOutN(cc CopyContext, addr usermem.Addr, limit int) (int, error)
+}
+
+// go-marshal generates additional functions for a type based on additional
+// clauses to the +marshal directive. They are documented below.
+//
+// Slice API
+// =========
+//
+// Adding a "slice" clause to the +marshal directive for structs or newtypes on
+// primitives like this:
+//
+// // +marshal slice:FooSlice
+// type Foo struct { ... }
+//
+// Generates four additional functions for marshalling slices of Foos like this:
+//
+// // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, but for a []Foo. It
+// // may be more efficient than repeatedly calling Foo.MarshalUnsafe
+// // over a []Foo in a loop if the type is Packed.
+// // Preconditions: dst must be at least len(src)*Foo.SizeBytes() in length.
+// func MarshalUnsafeFooSlice(src []Foo, dst []byte) (int, error) { ... }
+//
+// // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, but for a []Foo. It
+// // may be more efficient than repeatedly calling Foo.UnmarshalUnsafe
+// // over a []Foo in a loop if the type is Packed.
+// // Preconditions: src must be at least len(dst)*Foo.SizeBytes() in length.
+// func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) (int, error) { ... }
+//
+// // CopyFooSliceIn copies in a slice of Foo objects from the task's memory.
+// func CopyFooSliceIn(cc marshal.CopyContext, addr usermem.Addr, dst []Foo) (int, error) { ... }
+//
+// // CopyFooSliceOut copies out a slice of Foo objects to the task's memory.
+// func CopyFooSliceOut(cc marshal.CopyContext, addr usermem.Addr, src []Foo) (int, error) { ... }
+//
+// The names of the functions are of the format "Copy%sIn" and "Copy%sOut", where
+// %s is the first argument to the slice clause. This directive is not supported
+// for newtypes on arrays.
+//
+// The slice clause also takes an optional second argument, which must be the
+// value "inner":
+//
+// // +marshal slice:Int32Slice:inner
+// type Int32 int32
+//
+// This is only valid on newtypes on primitives, and causes the generated
+// functions to accept slices of the inner type instead:
+//
+// func CopyInt32SliceIn(cc marshal.CopyContext, addr usermem.Addr, dst []int32) (int, error) { ... }
+//
+// Without "inner", they would instead be:
+//
+// func CopyInt32SliceIn(cc marshal.CopyContext, addr usermem.Addr, dst []Int32) (int, error) { ... }
+//
+// This may help avoid a cast depending on how the generated functions are used.
diff --git a/pkg/marshal/marshal_impl_util.go b/pkg/marshal/marshal_impl_util.go
new file mode 100644
index 000000000..ea75e09f2
--- /dev/null
+++ b/pkg/marshal/marshal_impl_util.go
@@ -0,0 +1,78 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package marshal
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// StubMarshallable is a convenient embeddable type for satisfying the
+// Marshallable interface, but it provides no actual implementation: every
+// method panics. It is useful when the Marshallable interface is required
+// but only a subset of its methods will be implemented manually.
+type StubMarshallable struct{}
+
+// WriteTo implements Marshallable.WriteTo.
+func (StubMarshallable) WriteTo(w io.Writer) (n int64, err error) {
+	panic("Please implement your own WriteTo function")
+}
+
+// SizeBytes implements Marshallable.SizeBytes.
+func (StubMarshallable) SizeBytes() int {
+	panic("Please implement your own SizeBytes function")
+}
+
+// MarshalBytes implements Marshallable.MarshalBytes.
+func (StubMarshallable) MarshalBytes(dst []byte) {
+	panic("Please implement your own MarshalBytes function")
+}
+
+// UnmarshalBytes implements Marshallable.UnmarshalBytes.
+func (StubMarshallable) UnmarshalBytes(src []byte) {
+	panic("Please implement your own UnmarshalBytes function")
+}
+
+// Packed implements Marshallable.Packed.
+func (StubMarshallable) Packed() bool {
+	panic("Please implement your own Packed function")
+}
+
+// MarshalUnsafe implements Marshallable.MarshalUnsafe.
+func (StubMarshallable) MarshalUnsafe(dst []byte) {
+	panic("Please implement your own MarshalUnsafe function")
+}
+
+// UnmarshalUnsafe implements Marshallable.UnmarshalUnsafe.
+func (StubMarshallable) UnmarshalUnsafe(src []byte) {
+	panic("Please implement your own UnmarshalUnsafe function")
+}
+
+// CopyIn implements Marshallable.CopyIn.
+func (StubMarshallable) CopyIn(cc CopyContext, addr usermem.Addr) (int, error) {
+	panic("Please implement your own CopyIn function")
+}
+
+// CopyOut implements Marshallable.CopyOut.
+func (StubMarshallable) CopyOut(cc CopyContext, addr usermem.Addr) (int, error) {
+	panic("Please implement your own CopyOut function")
+}
+
+// CopyOutN implements Marshallable.CopyOutN.
+func (StubMarshallable) CopyOutN(cc CopyContext, addr usermem.Addr, limit int) (int, error) { + panic("Please implement your own CopyOutN function") +} diff --git a/pkg/marshal/primitive/BUILD b/pkg/marshal/primitive/BUILD new file mode 100644 index 000000000..d77a11c79 --- /dev/null +++ b/pkg/marshal/primitive/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "primitive", + srcs = [ + "primitive.go", + ], + marshal = True, + visibility = [ + "//:sandbox", + ], + deps = [ + "//pkg/context", + "//pkg/marshal", + "//pkg/usermem", + ], +) diff --git a/pkg/marshal/primitive/primitive.go b/pkg/marshal/primitive/primitive.go new file mode 100644 index 000000000..4b342de6b --- /dev/null +++ b/pkg/marshal/primitive/primitive.go @@ -0,0 +1,349 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package primitive defines marshal.Marshallable implementations for primitive +// types. +package primitive + +import ( + "io" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Int8 is a marshal.Marshallable implementation for int8. +// +// +marshal slice:Int8Slice:inner +type Int8 int8 + +// Uint8 is a marshal.Marshallable implementation for uint8. +// +// +marshal slice:Uint8Slice:inner +type Uint8 uint8 + +// Int16 is a marshal.Marshallable implementation for int16. +// +// +marshal slice:Int16Slice:inner +type Int16 int16 + +// Uint16 is a marshal.Marshallable implementation for uint16. +// +// +marshal slice:Uint16Slice:inner +type Uint16 uint16 + +// Int32 is a marshal.Marshallable implementation for int32. +// +// +marshal slice:Int32Slice:inner +type Int32 int32 + +// Uint32 is a marshal.Marshallable implementation for uint32. +// +// +marshal slice:Uint32Slice:inner +type Uint32 uint32 + +// Int64 is a marshal.Marshallable implementation for int64. +// +// +marshal slice:Int64Slice:inner +type Int64 int64 + +// Uint64 is a marshal.Marshallable implementation for uint64. +// +// +marshal slice:Uint64Slice:inner +type Uint64 uint64 + +// ByteSlice is a marshal.Marshallable implementation for []byte. +// This is a convenience wrapper around a dynamically sized type, and can't be +// embedded in other marshallable types because it breaks assumptions made by +// go-marshal internals. It violates the "no dynamically-sized types" +// constraint of the go-marshal library. +type ByteSlice []byte + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (b *ByteSlice) SizeBytes() int { + return len(*b) +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (b *ByteSlice) MarshalBytes(dst []byte) { + copy(dst, *b) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (b *ByteSlice) UnmarshalBytes(src []byte) { + copy(*b, src) +} + +// Packed implements marshal.Marshallable.Packed. 
+func (b *ByteSlice) Packed() bool { + return false +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (b *ByteSlice) MarshalUnsafe(dst []byte) { + b.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (b *ByteSlice) UnmarshalUnsafe(src []byte) { + b.UnmarshalBytes(src) +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (b *ByteSlice) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) { + return cc.CopyInBytes(addr, *b) +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (b *ByteSlice) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) { + return cc.CopyOutBytes(addr, *b) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (b *ByteSlice) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) { + return cc.CopyOutBytes(addr, (*b)[:limit]) +} + +// WriteTo implements io.WriterTo.WriteTo. +func (b *ByteSlice) WriteTo(w io.Writer) (int64, error) { + n, err := w.Write(*b) + return int64(n), err +} + +var _ marshal.Marshallable = (*ByteSlice)(nil) + +// Below, we define some convenience functions for marshalling primitive types +// using the newtypes above, without requiring superfluous casts. + +// 8-bit integers + +// CopyInt8In is a convenient wrapper for copying in an int8 from the task's +// memory. +func CopyInt8In(cc marshal.CopyContext, addr usermem.Addr, dst *int8) (int, error) { + var buf Int8 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = int8(buf) + return n, nil +} + +// CopyInt8Out is a convenient wrapper for copying out an int8 to the task's +// memory. +func CopyInt8Out(cc marshal.CopyContext, addr usermem.Addr, src int8) (int, error) { + srcP := Int8(src) + return srcP.CopyOut(cc, addr) +} + +// CopyUint8In is a convenient wrapper for copying in a uint8 from the task's +// memory. +func CopyUint8In(cc marshal.CopyContext, addr usermem.Addr, dst *uint8) (int, error) { + var buf Uint8 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = uint8(buf) + return n, nil +} + +// CopyUint8Out is a convenient wrapper for copying out a uint8 to the task's +// memory. +func CopyUint8Out(cc marshal.CopyContext, addr usermem.Addr, src uint8) (int, error) { + srcP := Uint8(src) + return srcP.CopyOut(cc, addr) +} + +// 16-bit integers + +// CopyInt16In is a convenient wrapper for copying in an int16 from the task's +// memory. +func CopyInt16In(cc marshal.CopyContext, addr usermem.Addr, dst *int16) (int, error) { + var buf Int16 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = int16(buf) + return n, nil +} + +// CopyInt16Out is a convenient wrapper for copying out an int16 to the task's +// memory. +func CopyInt16Out(cc marshal.CopyContext, addr usermem.Addr, src int16) (int, error) { + srcP := Int16(src) + return srcP.CopyOut(cc, addr) +} + +// CopyUint16In is a convenient wrapper for copying in a uint16 from the task's +// memory. +func CopyUint16In(cc marshal.CopyContext, addr usermem.Addr, dst *uint16) (int, error) { + var buf Uint16 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = uint16(buf) + return n, nil +} + +// CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's +// memory. 
+func CopyUint16Out(cc marshal.CopyContext, addr usermem.Addr, src uint16) (int, error) { + srcP := Uint16(src) + return srcP.CopyOut(cc, addr) +} + +// 32-bit integers + +// CopyInt32In is a convenient wrapper for copying in an int32 from the task's +// memory. +func CopyInt32In(cc marshal.CopyContext, addr usermem.Addr, dst *int32) (int, error) { + var buf Int32 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = int32(buf) + return n, nil +} + +// CopyInt32Out is a convenient wrapper for copying out an int32 to the task's +// memory. +func CopyInt32Out(cc marshal.CopyContext, addr usermem.Addr, src int32) (int, error) { + srcP := Int32(src) + return srcP.CopyOut(cc, addr) +} + +// CopyUint32In is a convenient wrapper for copying in a uint32 from the task's +// memory. +func CopyUint32In(cc marshal.CopyContext, addr usermem.Addr, dst *uint32) (int, error) { + var buf Uint32 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = uint32(buf) + return n, nil +} + +// CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's +// memory. +func CopyUint32Out(cc marshal.CopyContext, addr usermem.Addr, src uint32) (int, error) { + srcP := Uint32(src) + return srcP.CopyOut(cc, addr) +} + +// 64-bit integers + +// CopyInt64In is a convenient wrapper for copying in an int64 from the task's +// memory. +func CopyInt64In(cc marshal.CopyContext, addr usermem.Addr, dst *int64) (int, error) { + var buf Int64 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = int64(buf) + return n, nil +} + +// CopyInt64Out is a convenient wrapper for copying out an int64 to the task's +// memory. +func CopyInt64Out(cc marshal.CopyContext, addr usermem.Addr, src int64) (int, error) { + srcP := Int64(src) + return srcP.CopyOut(cc, addr) +} + +// CopyUint64In is a convenient wrapper for copying in a uint64 from the task's +// memory. +func CopyUint64In(cc marshal.CopyContext, addr usermem.Addr, dst *uint64) (int, error) { + var buf Uint64 + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = uint64(buf) + return n, nil +} + +// CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's +// memory. +func CopyUint64Out(cc marshal.CopyContext, addr usermem.Addr, src uint64) (int, error) { + srcP := Uint64(src) + return srcP.CopyOut(cc, addr) +} + +// CopyByteSliceIn is a convenient wrapper for copying in a []byte from the +// task's memory. +func CopyByteSliceIn(cc marshal.CopyContext, addr usermem.Addr, dst *[]byte) (int, error) { + var buf ByteSlice + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = []byte(buf) + return n, nil +} + +// CopyByteSliceOut is a convenient wrapper for copying out a []byte to the +// task's memory. +func CopyByteSliceOut(cc marshal.CopyContext, addr usermem.Addr, src []byte) (int, error) { + srcP := ByteSlice(src) + return srcP.CopyOut(cc, addr) +} + +// CopyStringIn is a convenient wrapper for copying in a string from the +// task's memory. +func CopyStringIn(cc marshal.CopyContext, addr usermem.Addr, dst *string) (int, error) { + var buf ByteSlice + n, err := buf.CopyIn(cc, addr) + if err != nil { + return n, err + } + *dst = string(buf) + return n, nil +} + +// CopyStringOut is a convenient wrapper for copying out a string to the task's +// memory. 
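+//
+// As an illustrative sketch (ctx, uio, and addr are assumed to exist; they
+// are not defined in this package), these helpers can be driven through the
+// IOCopyContext adapter defined at the end of this file:
+//
+//	cc := &IOCopyContext{Ctx: ctx, IO: uio, Opts: usermem.IOOpts{}}
+//	n, err := CopyStringOut(cc, addr, "hello")
+//	if err != nil {
+//		// A partial copy-out still reports n bytes written.
+//	}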
+func CopyStringOut(cc marshal.CopyContext, addr usermem.Addr, src string) (int, error) {
+	srcP := ByteSlice(src)
+	return srcP.CopyOut(cc, addr)
+}
+
+// IOCopyContext wraps an object implementing usermem.IO to implement
+// marshal.CopyContext.
+type IOCopyContext struct {
+	Ctx  context.Context
+	IO   usermem.IO
+	Opts usermem.IOOpts
+}
+
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (i *IOCopyContext) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (i *IOCopyContext) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
+	return i.IO.CopyOut(i.Ctx, addr, b, i.Opts)
+}
+
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (i *IOCopyContext) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
+	return i.IO.CopyIn(i.Ctx, addr, b, i.Opts)
+}
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
index 955c9c473..d8227b8bd 100644
--- a/pkg/merkletree/merkletree.go
+++ b/pkg/merkletree/merkletree.go
@@ -29,13 +29,19 @@ const (
 	sha256DigestSize = 32
 )
 
+// DigestSize returns the size (in bytes) of a digest.
+// TODO(b/156980949): Allow configuring other hash methods (SHA384/SHA512).
+func DigestSize() int {
+	return sha256DigestSize
+}
+
 // Layout defines the scale of a Merkle tree.
 type Layout struct {
 	// blockSize is the size of a data block to be hashed.
 	blockSize int64
 	// digestSize is the size of a generated hash.
 	digestSize int64
-	// levelOffset contains the offset of the begnning of each level in
+	// levelOffset contains the offset of the beginning of each level in
 	// bytes. The number of levels in the tree is the length of the slice.
 	// The leaf nodes (level 0) contain hashes of blocks of the input data.
 	// Each level N contains hashes of the blocks in level N-1. The highest
@@ -45,12 +51,25 @@ type Layout struct {
 
 // InitLayout initializes and returns a new Layout object describing the structure
 // of a tree. dataSize specifies the size of input data in bytes.
-func InitLayout(dataSize int64) Layout {
+func InitLayout(dataSize int64, dataAndTreeInSameFile bool) Layout {
 	layout := Layout{
 		blockSize: usermem.PageSize,
 		// TODO(b/156980949): Allow configuring other hash methods (SHA384/SHA512).
 		digestSize: sha256DigestSize,
 	}
+
+	// treeStart is the offset (in bytes) of the first level of the tree in
+	// the file. If data and tree are in different files, treeStart should
+	// be zero. If data is in the same file as the tree, treeStart points
+	// to the block after the last data block (which may be zero-padded).
+	var treeStart int64
+	if dataAndTreeInSameFile {
+		treeStart = dataSize
+		if dataSize%layout.blockSize != 0 {
+			treeStart += layout.blockSize - dataSize%layout.blockSize
+		}
+	}
+
 	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
 	level := 0
 	offset := int64(0)
@@ -60,14 +79,15 @@ func InitLayout(dataSize int64) Layout {
 	// contain the hashes of the data blocks, while level numLevels - 1 is
 	// the root.
 	for numBlocks > 1 {
-		layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+		layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
 		// Round numBlocks up to fill up a block.
 		numBlocks += (layout.hashesPerBlock() - numBlocks%layout.hashesPerBlock()) % layout.hashesPerBlock()
 		offset += numBlocks / layout.hashesPerBlock()
 		numBlocks = numBlocks / layout.hashesPerBlock()
 		level++
 	}
-	layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+	layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
+
 	return layout
 }
 
@@ -103,14 +123,72 @@ func (layout Layout) blockOffset(level int, index int64) int64 {
 	return layout.levelOffset[level] + index*layout.blockSize
 }
 
-// Generate constructs a Merkle tree for the contents of data. The output is
-// written to treeWriter. The treeReader should be able to read the tree after
-// it has been written. That is, treeWriter and treeReader should point to the
-// same underlying data but have separate cursors.
-func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter io.Writer) ([]byte, error) {
-	layout := InitLayout(dataSize)
+// VerityDescriptor is a struct that is serialized and hashed to produce a
+// file's root hash. It contains the root hash of the raw content and the
+// file's metadata.
+type VerityDescriptor struct {
+	Name     string
+	Mode     uint32
+	UID      uint32
+	GID      uint32
+	RootHash []byte
+}
 
-	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
+func (d *VerityDescriptor) String() string {
+	return fmt.Sprintf("Name: %s, Mode: %d, UID: %d, GID: %d, RootHash: %v", d.Name, d.Mode, d.UID, d.GID, d.RootHash)
+}
+
+// verify generates a hash from d, and compares it with expected.
+func (d *VerityDescriptor) verify(expected []byte) error {
+	h := sha256.Sum256([]byte(d.String()))
+	if !bytes.Equal(h[:], expected) {
+		return fmt.Errorf("unexpected root hash")
+	}
+	return nil
+}
+
+// GenerateParams contains the parameters used to generate a Merkle tree.
+type GenerateParams struct {
+	// File is a reader of the file to be hashed.
+	File io.ReaderAt
+	// Size is the size of the file.
+	Size int64
+	// Name is the name of the target file.
+	Name string
+	// Mode is the mode of the target file.
+	Mode uint32
+	// UID is the user ID of the target file.
+	UID uint32
+	// GID is the group ID of the target file.
+	GID uint32
+	// TreeReader is a reader for the Merkle tree.
+	TreeReader io.ReaderAt
+	// TreeWriter is a writer for the Merkle tree.
+	TreeWriter io.Writer
+	// DataAndTreeInSameFile is true if data and Merkle tree are in the same
+	// file, or false if Merkle tree is a separate file from data.
+	DataAndTreeInSameFile bool
+}
+
+// Generate constructs a Merkle tree for the contents of params.File. The
+// output is written to params.TreeWriter.
+//
+// Generate returns a hash of a VerityDescriptor, which contains the file
+// metadata and the hash of the file content.
+func Generate(params *GenerateParams) ([]byte, error) {
+	layout := InitLayout(params.Size, params.DataAndTreeInSameFile)
+
+	numBlocks := (params.Size + layout.blockSize - 1) / layout.blockSize
+
+	// If the data is in the same file as the tree, zero pad the last data
+	// block.
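+	// (Illustrative arithmetic, not part of the change itself: with
+	// blockSize = 4096 and params.Size = 5000, bytesInLastBlock below is
+	// 904, so 3192 zero bytes are appended and the tree starts at offset
+	// 8192, matching treeStart in InitLayout.)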
+	bytesInLastBlock := params.Size % layout.blockSize
+	if params.DataAndTreeInSameFile && bytesInLastBlock != 0 {
+		zeroBuf := make([]byte, layout.blockSize-bytesInLastBlock)
+		if _, err := params.TreeWriter.Write(zeroBuf); err != nil {
+			return nil, err
+		}
+	}
 
 	var root []byte
 	for level := 0; level < layout.numLevels(); level++ {
@@ -123,11 +201,11 @@
 			if level == 0 {
 				// Read data block from the target file since level 0 includes hashes
 				// of blocks in the input data.
-				n, err = data.Read(buf)
+				n, err = params.File.ReadAt(buf, i*layout.blockSize)
 			} else {
 				// Read data block from the tree file since levels higher than 0 are
 				// hashing the lower level hashes.
-				n, err = treeReader.Read(buf)
+				n, err = params.TreeReader.ReadAt(buf, layout.blockOffset(level-1, i))
 			}
 
 			// err is populated as long as the bytes read is smaller than the buffer
@@ -147,7 +225,7 @@
 			}
 
 			// Write the generated hash to the end of the tree file.
-			if _, err = treeWriter.Write(digest[:]); err != nil {
+			if _, err = params.TreeWriter.Write(digest[:]); err != nil {
 				return nil, err
 			}
 		}
@@ -155,61 +233,111 @@
 		// remaining of the last block. But no need to do so for root.
 		if level != layout.rootLevel() && numBlocks%layout.hashesPerBlock() != 0 {
 			zeroBuf := make([]byte, layout.blockSize-(numBlocks%layout.hashesPerBlock())*layout.digestSize)
-			if _, err := treeWriter.Write(zeroBuf[:]); err != nil {
+			if _, err := params.TreeWriter.Write(zeroBuf[:]); err != nil {
 				return nil, err
 			}
 		}
 		numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock()
 	}
-	return root, nil
+	descriptor := VerityDescriptor{
+		Name:     params.Name,
+		Mode:     params.Mode,
+		UID:      params.UID,
+		GID:      params.GID,
+		RootHash: root,
+	}
+	ret := sha256.Sum256([]byte(descriptor.String()))
+	return ret[:], nil
+}
+
+// VerifyParams contains the params used to verify a portion of a file against
+// a Merkle tree.
+type VerifyParams struct {
+	// Out will be filled with verified data.
+	Out io.Writer
+	// File is a handle on the file to be verified.
+	File io.ReaderAt
+	// Tree is a handle on the Merkle tree used to verify File.
+	Tree io.ReaderAt
+	// Size is the size of the file.
+	Size int64
+	// Name is the name of the target file.
+	Name string
+	// Mode is the mode of the target file.
+	Mode uint32
+	// UID is the user ID of the target file.
+	UID uint32
+	// GID is the group ID of the target file.
+	GID uint32
+	// ReadOffset is the offset of the data range to be verified.
+	ReadOffset int64
+	// ReadSize is the size of the data range to be verified.
+	ReadSize int64
+	// Expected is a trusted hash for the file. It is compared with the
+	// calculated root hash to verify the content.
+	Expected []byte
+	// DataAndTreeInSameFile is true if data and Merkle tree are in the same
+	// file, or false if Merkle tree is a separate file from data.
+	DataAndTreeInSameFile bool
+}
+
+// verifyMetadata verifies the metadata by hashing a descriptor that contains
+// the metadata and comparing the generated hash with expected.
+//
+// For verifyMetadata, params.File is not needed. It only accesses params.Tree
+// for the raw root hash.
+func verifyMetadata(params *VerifyParams, layout *Layout) error { + root := make([]byte, layout.digestSize) + if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil { + return fmt.Errorf("failed to read root hash: %w", err) + } + descriptor := VerityDescriptor{ + Name: params.Name, + Mode: params.Mode, + UID: params.UID, + GID: params.GID, + RootHash: root, + } + return descriptor.verify(params.Expected) } // Verify verifies the content read from data with offset. The content is // verified against tree. If content spans across multiple blocks, each block is // verified. Verification fails if the hash of the data does not match the tree -// at any level, or if the final root hash does not match expectedRoot. -// Once the data is verified, it will be written using w. -// Verify will modify the cursor for data, but always restores it to its -// original position upon exit. The cursor for tree is modified and not -// restored. -func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset int64, readSize int64, expectedRoot []byte) error { - if readSize <= 0 { - return fmt.Errorf("Unexpected read size: %d", readSize) +// at any level, or if the final root hash does not match expected. +// Once the data is verified, it will be written using params.Out. +// +// Verify checks for both target file content and metadata. If readSize is 0, +// only metadata is checked. +func Verify(params *VerifyParams) (int64, error) { + if params.ReadSize < 0 { + return 0, fmt.Errorf("unexpected read size: %d", params.ReadSize) + } + layout := InitLayout(int64(params.Size), params.DataAndTreeInSameFile) + if params.ReadSize == 0 { + return 0, verifyMetadata(params, &layout) } - layout := InitLayout(int64(dataSize)) // Calculate the index of blocks that includes the target range in input // data. - firstDataBlock := readOffset / layout.blockSize - lastDataBlock := (readOffset + readSize - 1) / layout.blockSize - - // Store the current offset, so we can set it back once verification - // finishes. - origOffset, err := data.Seek(0, io.SeekCurrent) - if err != nil { - return fmt.Errorf("Find current data offset failed: %v", err) - } - defer data.Seek(origOffset, io.SeekStart) - - // Move to the first block that contains target data. - if _, err := data.Seek(firstDataBlock*layout.blockSize, io.SeekStart); err != nil { - return fmt.Errorf("Seek to datablock start failed: %v", err) - } + firstDataBlock := params.ReadOffset / layout.blockSize + lastDataBlock := (params.ReadOffset + params.ReadSize - 1) / layout.blockSize buf := make([]byte, layout.blockSize) var readErr error - bytesRead := 0 + total := int64(0) for i := firstDataBlock; i <= lastDataBlock; i++ { // Read a block that includes all or part of target range in // input data. - bytesRead, readErr = data.Read(buf) + bytesRead, err := params.File.ReadAt(buf, i*layout.blockSize) + readErr = err // If at the end of input data and all previous blocks are // verified, return the verified input data and EOF. if readErr == io.EOF && bytesRead == 0 { break } if readErr != nil && readErr != io.EOF { - return fmt.Errorf("Read from data failed: %v", err) + return 0, fmt.Errorf("read from data failed: %w", err) } // If this is the end of file, zero the remaining bytes in buf, // otherwise they are still from the previous block. 
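Taken together, the reworked API is driven entirely through the two params structs. A minimal end-to-end sketch (illustrative only: bytes.Reader stands in for the data file, bytesReadWriter is the in-memory test helper from merkletree_test.go shown later in this diff, and all names and values here are assumptions):

	data := make([]byte, 2*usermem.PageSize)
	var tree bytesReadWriter
	hash, err := Generate(&GenerateParams{
		File:       bytes.NewReader(data),
		Size:       int64(len(data)),
		Name:       "file",
		Mode:       0644,
		TreeReader: &tree,
		TreeWriter: &tree,
	})
	if err != nil {
		// handle error
	}
	var out bytes.Buffer
	// Verify the first page; n is the count of verified bytes written to out.
	n, err := Verify(&VerifyParams{
		Out:        &out,
		File:       bytes.NewReader(data),
		Tree:       &tree,
		Size:       int64(len(data)),
		Name:       "file",
		Mode:       0644,
		ReadOffset: 0,
		ReadSize:   int64(usermem.PageSize),
		Expected:   hash,
	})

The metadata fields passed to Verify must match those passed to Generate, since both feed the hashed VerityDescriptor; UID, GID, and DataAndTreeInSameFile are left at their zero values on both sides here.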
@@ -220,22 +348,29 @@ func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset in buf[j] = 0 } } - if err := verifyBlock(tree, layout, buf, i, expectedRoot); err != nil { - return err + descriptor := VerityDescriptor{ + Name: params.Name, + Mode: params.Mode, + UID: params.UID, + GID: params.GID, } + if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.Expected); err != nil { + return 0, err + } + // startOff is the beginning of the read range within the // current data block. Note that for all blocks other than the // first, startOff should be 0. startOff := int64(0) if i == firstDataBlock { - startOff = readOffset % layout.blockSize + startOff = params.ReadOffset % layout.blockSize } // endOff is the end of the read range within the current data // block. Note that for all blocks other than the last, endOff // should be the block size. endOff := layout.blockSize if i == lastDataBlock { - endOff = (readOffset+readSize-1)%layout.blockSize + 1 + endOff = (params.ReadOffset+params.ReadSize-1)%layout.blockSize + 1 } // If the provided size exceeds the end of input data, we should // only copy the parts in buf that's part of input data. @@ -245,19 +380,22 @@ func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset in if endOff > int64(bytesRead) { endOff = int64(bytesRead) } - w.Write(buf[startOff:endOff]) + n, err := params.Out.Write(buf[startOff:endOff]) + if err != nil { + return total, err + } + total += int64(n) } - return readErr + return total, readErr } // verifyBlock verifies a block against tree. index is the number of block in // original data. The block is verified through each level of the tree. It // fails if the calculated hash from block is different from any level of // hashes stored in tree. And the final root hash is compared with -// expectedRoot. verifyBlock modifies the cursor for tree. Users needs to -// maintain the cursor if intended. -func verifyBlock(tree io.ReadSeeker, layout Layout, dataBlock []byte, blockIndex int64, expectedRoot []byte) error { +// expected. +func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, expected []byte) error { if len(dataBlock) != int(layout.blockSize) { return fmt.Errorf("incorrect block size") } @@ -274,41 +412,27 @@ func verifyBlock(tree io.ReadSeeker, layout Layout, dataBlock []byte, blockIndex // Read a block in previous level that contains the // hash we just generated, and generate a next level // hash from it. - if _, err := tree.Seek(layout.blockOffset(level-1, blockIndex), io.SeekStart); err != nil { - return err - } - if _, err := tree.Read(treeBlock); err != nil { + if _, err := tree.ReadAt(treeBlock, layout.blockOffset(level-1, blockIndex)); err != nil { return err } digestArray := sha256.Sum256(treeBlock) digest = digestArray[:] } - // Move to stored hash for the current block, read the digest - // and store in expectedDigest. - if _, err := tree.Seek(layout.digestOffset(level, blockIndex), io.SeekStart); err != nil { - return err - } - if _, err := tree.Read(expectedDigest); err != nil { + // Read the digest for the current block and store in + // expectedDigest. + if _, err := tree.ReadAt(expectedDigest, layout.digestOffset(level, blockIndex)); err != nil { return err } if !bytes.Equal(digest, expectedDigest) { - return fmt.Errorf("Verification failed") - } - - // If this is the root layer, no need to generate next level - // hash. 
- if level == layout.rootLevel() { - break + return fmt.Errorf("verification failed") } blockIndex = blockIndex / layout.hashesPerBlock() } - // Verification for the tree succeeded. Now compare the root hash in the - // tree with expectedRoot. - if !bytes.Equal(digest[:], expectedRoot) { - return fmt.Errorf("Verification failed") - } - return nil + // Verification for the tree succeeded. Now hash the descriptor with + // the root hash and compare it with expected. + descriptor.RootHash = digest + return descriptor.verify(expected) } diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go index 911f61df9..e1350ebda 100644 --- a/pkg/merkletree/merkletree_test.go +++ b/pkg/merkletree/merkletree_test.go @@ -27,130 +27,153 @@ import ( func TestLayout(t *testing.T) { testCases := []struct { - dataSize int64 - expectedLevelOffset []int64 + dataSize int64 + dataAndTreeInSameFile bool + expectedLevelOffset []int64 }{ { - dataSize: 100, - expectedLevelOffset: []int64{0}, + dataSize: 100, + dataAndTreeInSameFile: false, + expectedLevelOffset: []int64{0}, }, { - dataSize: 1000000, - expectedLevelOffset: []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize}, + dataSize: 100, + dataAndTreeInSameFile: true, + expectedLevelOffset: []int64{usermem.PageSize}, }, { - dataSize: 4096 * int64(usermem.PageSize), - expectedLevelOffset: []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize}, + dataSize: 1000000, + dataAndTreeInSameFile: false, + expectedLevelOffset: []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize}, + }, + { + dataSize: 1000000, + dataAndTreeInSameFile: true, + expectedLevelOffset: []int64{245 * usermem.PageSize, 247 * usermem.PageSize, 248 * usermem.PageSize}, + }, + { + dataSize: 4096 * int64(usermem.PageSize), + dataAndTreeInSameFile: false, + expectedLevelOffset: []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize}, + }, + { + dataSize: 4096 * int64(usermem.PageSize), + dataAndTreeInSameFile: true, + expectedLevelOffset: []int64{4096 * usermem.PageSize, 4128 * usermem.PageSize, 4129 * usermem.PageSize}, }, } for _, tc := range testCases { t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) { - p := InitLayout(tc.dataSize) - if p.blockSize != int64(usermem.PageSize) { - t.Errorf("got blockSize %d, want %d", p.blockSize, usermem.PageSize) + l := InitLayout(tc.dataSize, tc.dataAndTreeInSameFile) + if l.blockSize != int64(usermem.PageSize) { + t.Errorf("Got blockSize %d, want %d", l.blockSize, usermem.PageSize) } - if p.digestSize != sha256DigestSize { - t.Errorf("got digestSize %d, want %d", p.digestSize, sha256DigestSize) + if l.digestSize != sha256DigestSize { + t.Errorf("Got digestSize %d, want %d", l.digestSize, sha256DigestSize) } - if p.numLevels() != len(tc.expectedLevelOffset) { - t.Errorf("got levels %d, want %d", p.numLevels(), len(tc.expectedLevelOffset)) + if l.numLevels() != len(tc.expectedLevelOffset) { + t.Errorf("Got levels %d, want %d", l.numLevels(), len(tc.expectedLevelOffset)) } - for i := 0; i < p.numLevels() && i < len(tc.expectedLevelOffset); i++ { - if p.levelOffset[i] != tc.expectedLevelOffset[i] { - t.Errorf("got levelStart[%d] %d, want %d", i, p.levelOffset[i], tc.expectedLevelOffset[i]) + for i := 0; i < l.numLevels() && i < len(tc.expectedLevelOffset); i++ { + if l.levelOffset[i] != tc.expectedLevelOffset[i] { + t.Errorf("Got levelStart[%d] %d, want %d", i, l.levelOffset[i], tc.expectedLevelOffset[i]) } } }) } } +const ( + defaultName = "merkle_test" + defaultMode = 0644 + defaultUID = 0 + defaultGID = 0 +) + +// 
bytesReadWriter is used to read from/write to a byte array. Unlike
+// bytes.Buffer, it keeps the whole buffer during read so that it can be reused.
+type bytesReadWriter struct {
+	// bytes contains the underlying byte array.
+	bytes []byte
+	// readPos is the current read location. Write always appends to
+	// the end of the array.
+	readPos int
+}
+
+func (brw *bytesReadWriter) Write(p []byte) (int, error) {
+	brw.bytes = append(brw.bytes, p...)
+	return len(p), nil
+}
+
+func (brw *bytesReadWriter) ReadAt(p []byte, off int64) (int, error) {
+	bytesRead := copy(p, brw.bytes[off:])
+	if bytesRead == 0 {
+		return bytesRead, io.EOF
+	}
+	return bytesRead, nil
+}
+
 func TestGenerate(t *testing.T) {
 	// The input data has size dataSize. It starts with the data in startWith,
 	// and all other bytes are zeroes.
 	testCases := []struct {
 		data         []byte
-		expectedRoot []byte
+		expectedHash []byte
 	}{
 		{
 			data:         bytes.Repeat([]byte{0}, usermem.PageSize),
-			expectedRoot: []byte{173, 127, 172, 178, 88, 111, 198, 233, 102, 192, 4, 215, 209, 209, 107, 2, 79, 88, 5, 255, 124, 180, 124, 122, 133, 218, 189, 139, 72, 137, 44, 167},
+			expectedHash: []byte{64, 253, 58, 72, 192, 131, 82, 184, 193, 33, 108, 142, 43, 46, 179, 134, 244, 21, 29, 190, 14, 39, 66, 129, 6, 46, 200, 211, 30, 247, 191, 252},
 		},
 		{
 			data:         bytes.Repeat([]byte{0}, 128*usermem.PageSize+1),
-			expectedRoot: []byte{62, 93, 40, 92, 161, 241, 30, 223, 202, 99, 39, 2, 132, 113, 240, 139, 117, 99, 79, 243, 54, 18, 100, 184, 141, 121, 238, 46, 149, 202, 203, 132},
+			expectedHash: []byte{182, 223, 218, 62, 65, 185, 160, 219, 93, 119, 186, 88, 205, 32, 122, 231, 173, 72, 78, 76, 65, 57, 177, 146, 159, 39, 44, 123, 230, 156, 97, 26},
 		},
 		{
 			data:         []byte{'a'},
-			expectedRoot: []byte{52, 75, 204, 142, 172, 129, 37, 14, 145, 137, 103, 203, 11, 162, 209, 205, 30, 169, 213, 72, 20, 28, 243, 24, 242, 2, 92, 43, 169, 59, 110, 210},
+			expectedHash: []byte{28, 201, 8, 36, 150, 178, 111, 5, 193, 212, 129, 205, 206, 124, 211, 90, 224, 142, 81, 183, 72, 165, 243, 240, 242, 241, 76, 127, 101, 61, 63, 11},
 		},
 		{
 			data:         bytes.Repeat([]byte{'a'}, usermem.PageSize),
-			expectedRoot: []byte{201, 62, 238, 45, 13, 176, 47, 16, 172, 199, 70, 13, 149, 118, 225, 34, 220, 248, 205, 83, 196, 191, 141, 252, 174, 27, 62, 116, 235, 207, 255, 90},
+			expectedHash: []byte{106, 58, 160, 152, 41, 68, 38, 108, 245, 74, 177, 84, 64, 193, 19, 176, 249, 86, 27, 193, 85, 164, 99, 240, 79, 104, 148, 222, 76, 46, 191, 79},
 		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("%d:%v", len(tc.data), tc.data[0]), func(t *testing.T) {
-			var tree bytes.Buffer
-
-			root, err := Generate(bytes.NewBuffer(tc.data), int64(len(tc.data)), &tree, &tree)
-			if err != nil {
-				t.Fatalf("Generate failed: %v", err)
-			}
+			for _, dataAndTreeInSameFile := range []bool{false, true} {
+				var tree bytesReadWriter
+				params := GenerateParams{
+					Size:                  int64(len(tc.data)),
+					Name:                  defaultName,
+					Mode:                  defaultMode,
+					UID:                   defaultUID,
+					GID:                   defaultGID,
+					TreeReader:            &tree,
+					TreeWriter:            &tree,
+					DataAndTreeInSameFile: dataAndTreeInSameFile,
+				}
+				if dataAndTreeInSameFile {
+					tree.Write(tc.data)
+					params.File = &tree
+				} else {
+					params.File = &bytesReadWriter{
+						bytes: tc.data,
+					}
+				}
+				hash, err := Generate(&params)
+				if err != nil {
+					t.Fatalf("Got err: %v, want nil", err)
+				}
 
-			if !bytes.Equal(root, tc.expectedRoot) {
-				t.Errorf("Unexpected root")
+				if !bytes.Equal(hash, tc.expectedHash) {
+					t.Errorf("Got hash: %v, want %v", hash, tc.expectedHash)
+				}
 			}
 		})
 	}
 }
 
-// bytesReadWriter is used to read from/write to/seek in a
byte array. Unlike -// bytes.Buffer, it keeps the whole buffer during read so that it can be reused. -type bytesReadWriter struct { - // bytes contains the underlying byte array. - bytes []byte - // readPos is the currently location for Read. Write always appends to - // the end of the array. - readPos int -} - -func (brw *bytesReadWriter) Write(p []byte) (int, error) { - brw.bytes = append(brw.bytes, p...) - return len(p), nil -} - -func (brw *bytesReadWriter) Read(p []byte) (int, error) { - if brw.readPos >= len(brw.bytes) { - return 0, io.EOF - } - bytesRead := copy(p, brw.bytes[brw.readPos:]) - brw.readPos += bytesRead - if bytesRead < len(p) { - return bytesRead, io.EOF - } - return bytesRead, nil -} - -func (brw *bytesReadWriter) Seek(offset int64, whence int) (int64, error) { - off := offset - if whence == io.SeekCurrent { - off += int64(brw.readPos) - } - if whence == io.SeekEnd { - off += int64(len(brw.bytes)) - } - if off < 0 { - panic("seek with negative offset") - } - if off >= int64(len(brw.bytes)) { - return 0, io.EOF - } - brw.readPos = int(off) - return off, nil -} - func TestVerify(t *testing.T) { // The input data has size dataSize. The portion to be verified ranges from // verifyStart with verifySize. A bit is flipped in outOfRangeByteIndex to @@ -165,6 +188,10 @@ func TestVerify(t *testing.T) { // modified byte falls in verification range, Verify should // fail, otherwise Verify should still succeed. modifyByte int64 + modifyName bool + modifyMode bool + modifyUID bool + modifyGID bool shouldSucceed bool }{ // Verify range start outside the data range should fail. @@ -193,12 +220,48 @@ func TestVerify(t *testing.T) { modifyByte: 0, shouldSucceed: false, }, - // Invalid verify range (0 size) should fail. + // 0 verify size should only verify metadata. + { + dataSize: usermem.PageSize, + verifyStart: 0, + verifySize: 0, + modifyByte: 0, + shouldSucceed: true, + }, + // Modified name should fail verification. + { + dataSize: usermem.PageSize, + verifyStart: 0, + verifySize: 0, + modifyByte: 0, + modifyName: true, + shouldSucceed: false, + }, + // Modified mode should fail verification. + { + dataSize: usermem.PageSize, + verifyStart: 0, + verifySize: 0, + modifyByte: 0, + modifyMode: true, + shouldSucceed: false, + }, + // Modified UID should fail verification. { dataSize: usermem.PageSize, verifyStart: 0, verifySize: 0, modifyByte: 0, + modifyUID: true, + shouldSucceed: false, + }, + // Modified GID should fail verification. + { + dataSize: usermem.PageSize, + verifyStart: 0, + verifySize: 0, + modifyByte: 0, + modifyGID: true, shouldSucceed: false, }, // The test cases below use a block-aligned verify range. @@ -284,26 +347,79 @@ func TestVerify(t *testing.T) { data := make([]byte, tc.dataSize) // Generate random bytes in data. 
rand.Read(data) - var tree bytesReadWriter - root, err := Generate(bytes.NewBuffer(data), int64(tc.dataSize), &tree, &tree) - if err != nil { - t.Fatalf("Generate failed: %v", err) - } + for _, dataAndTreeInSameFile := range []bool{false, true} { + var tree bytesReadWriter + genParams := GenerateParams{ + Size: int64(len(data)), + Name: defaultName, + Mode: defaultMode, + UID: defaultUID, + GID: defaultGID, + TreeReader: &tree, + TreeWriter: &tree, + DataAndTreeInSameFile: dataAndTreeInSameFile, + } + if dataAndTreeInSameFile { + tree.Write(data) + genParams.File = &tree + } else { + genParams.File = &bytesReadWriter{ + bytes: data, + } + } + hash, err := Generate(&genParams) + if err != nil { + t.Fatalf("Generate failed: %v", err) + } - // Flip a bit in data and checks Verify results. - var buf bytes.Buffer - data[tc.modifyByte] ^= 1 - if tc.shouldSucceed { - if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err != nil && err != io.EOF { - t.Errorf("Verification failed when expected to succeed: %v", err) + // Flip a bit in data and checks Verify results. + var buf bytes.Buffer + data[tc.modifyByte] ^= 1 + verifyParams := VerifyParams{ + Out: &buf, + File: bytes.NewReader(data), + Tree: &tree, + Size: tc.dataSize, + Name: defaultName, + Mode: defaultMode, + UID: defaultUID, + GID: defaultGID, + ReadOffset: tc.verifyStart, + ReadSize: tc.verifySize, + Expected: hash, + DataAndTreeInSameFile: dataAndTreeInSameFile, + } + if tc.modifyName { + verifyParams.Name = defaultName + "abc" + } + if tc.modifyMode { + verifyParams.Mode = defaultMode + 1 } - if int64(buf.Len()) != tc.verifySize || !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) { - t.Errorf("Incorrect output from Verify") + if tc.modifyUID { + verifyParams.UID = defaultUID + 1 } - } else { - if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err == nil { - t.Errorf("Verification succeeded when expected to fail") + if tc.modifyGID { + verifyParams.GID = defaultGID + 1 + } + if tc.shouldSucceed { + n, err := Verify(&verifyParams) + if err != nil && err != io.EOF { + t.Errorf("Verification failed when expected to succeed: %v", err) + } + if n != tc.verifySize { + t.Errorf("Got Verify output size %d, want %d", n, tc.verifySize) + } + if int64(buf.Len()) != tc.verifySize { + t.Errorf("Got Verify output buf size %d, want %d,", buf.Len(), tc.verifySize) + } + if !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) { + t.Errorf("Incorrect output buf from Verify") + } + } else { + if _, err := Verify(&verifyParams); err == nil { + t.Errorf("Verification succeeded when expected to fail") + } } } }) @@ -318,36 +434,88 @@ func TestVerifyRandom(t *testing.T) { data := make([]byte, dataSize) // Generate random bytes in data. rand.Read(data) - var tree bytesReadWriter - root, err := Generate(bytes.NewBuffer(data), int64(dataSize), &tree, &tree) - if err != nil { - t.Fatalf("Generate failed: %v", err) - } + for _, dataAndTreeInSameFile := range []bool{false, true} { + var tree bytesReadWriter + genParams := GenerateParams{ + Size: int64(len(data)), + Name: defaultName, + Mode: defaultMode, + UID: defaultUID, + GID: defaultGID, + TreeReader: &tree, + TreeWriter: &tree, + DataAndTreeInSameFile: dataAndTreeInSameFile, + } - // Pick a random portion of data. 
- start := rand.Int63n(dataSize - 1) - size := rand.Int63n(dataSize) + 1 + if dataAndTreeInSameFile { + tree.Write(data) + genParams.File = &tree + } else { + genParams.File = &bytesReadWriter{ + bytes: data, + } + } + hash, err := Generate(&genParams) + if err != nil { + t.Fatalf("Generate failed: %v", err) + } - var buf bytes.Buffer - // Checks that the random portion of data from the original data is - // verified successfully. - if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err != nil && err != io.EOF { - t.Errorf("Verification failed for correct data: %v", err) - } - if size > dataSize-start { - size = dataSize - start - } - if int64(buf.Len()) != size || !bytes.Equal(data[start:start+size], buf.Bytes()) { - t.Errorf("Incorrect output from Verify") - } + // Pick a random portion of data. + start := rand.Int63n(dataSize - 1) + size := rand.Int63n(dataSize) + 1 + + var buf bytes.Buffer + verifyParams := VerifyParams{ + Out: &buf, + File: bytes.NewReader(data), + Tree: &tree, + Size: dataSize, + Name: defaultName, + Mode: defaultMode, + UID: defaultUID, + GID: defaultGID, + ReadOffset: start, + ReadSize: size, + Expected: hash, + DataAndTreeInSameFile: dataAndTreeInSameFile, + } + + // Checks that the random portion of data from the original data is + // verified successfully. + n, err := Verify(&verifyParams) + if err != nil && err != io.EOF { + t.Errorf("Verification failed for correct data: %v", err) + } + if size > dataSize-start { + size = dataSize - start + } + if n != size { + t.Errorf("Got Verify output size %d, want %d", n, size) + } + if int64(buf.Len()) != size { + t.Errorf("Got Verify output buf size %d, want %d", buf.Len(), size) + } + if !bytes.Equal(data[start:start+size], buf.Bytes()) { + t.Errorf("Incorrect output buf from Verify") + } + + // Verify that modified metadata should fail verification. + buf.Reset() + verifyParams.Name = defaultName + "abc" + if _, err := Verify(&verifyParams); err == nil { + t.Error("Verify succeeded for modified metadata, expect failure") + } - buf.Reset() - // Flip a random bit in randPortion, and check that verification fails. - randBytePos := rand.Int63n(size) - data[start+randBytePos] ^= 1 + // Flip a random bit in randPortion, and check that verification fails. + buf.Reset() + randBytePos := rand.Int63n(size) + data[start+randBytePos] ^= 1 + verifyParams.File = bytes.NewReader(data) + verifyParams.Name = defaultName - if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err == nil { - t.Errorf("Verification succeeded for modified data") + if _, err := Verify(&verifyParams); err == nil { + t.Error("Verification succeeded for modified data, expect failure") + } } } diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index 58305009d..0a6a5d215 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -27,6 +27,6 @@ go_test( deps = [ ":metric_go_proto", "//pkg/eventchannel", - "@com_github_golang_protobuf//proto:go_default_library", + "@org_golang_google_protobuf//proto:go_default_library", ], ) diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 64aa365ce..d012c5734 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -106,8 +106,8 @@ type customUint64Metric struct { // after Initialized. // // Preconditions: -// * name must be globally unique. -// * Initialize/Disable have not been called. +// * name must be globally unique. +// * Initialize/Disable have not been called. 
func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func() uint64) error { if initialized { return ErrInitializationDone @@ -221,7 +221,7 @@ var ( // EmitMetricUpdate is thread-safe. // // Preconditions: -// * Initialize has been called. +// * Initialize has been called. func EmitMetricUpdate() { emitMu.Lock() defer emitMu.Unlock() diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index c425ea532..aefd0ea5c 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -17,7 +17,7 @@ package metric import ( "testing" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/eventchannel" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" ) diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 2ee07b664..28fe081d6 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -54,6 +54,8 @@ func (c *Client) newFile(fid FID) *clientFile { // // This proxies all of the interfaces found in file.go. type clientFile struct { + DisallowServerCalls + // client is the originating client. client *Client @@ -283,6 +285,39 @@ func (c *clientFile) Close() error { return nil } +// SetAttrClose implements File.SetAttrClose. +func (c *clientFile) SetAttrClose(valid SetAttrMask, attr SetAttr) error { + if !versionSupportsTsetattrclunk(c.client.version) { + setAttrErr := c.SetAttr(valid, attr) + + // Try to close file even in case of failure above. Since the state of the + // file is unknown to the caller, it will not attempt to close the file + // again. + if err := c.Close(); err != nil { + return err + } + + return setAttrErr + } + + // Avoid double close. + if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) { + return syscall.EBADF + } + + // Send the message. + if err := c.client.sendRecv(&Tsetattrclunk{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattrclunk{}); err != nil { + // If an error occurred, we toss away the FID. This isn't ideal, + // but I'm not sure what else makes sense in this context. + log.Warningf("Tsetattrclunk failed, losing FID %v: %v", c.fid, err) + return err + } + + // Return the FID to the pool. + c.client.fidPool.Put(uint64(c.fid)) + return nil +} + // Open implements File.Open. func (c *clientFile) Open(flags OpenFlags) (*fd.FD, QID, uint32, error) { if atomic.LoadUint32(&c.closed) != 0 { @@ -681,6 +716,3 @@ func (c *clientFile) Flush() error { return c.client.sendRecv(&Tflushf{FID: c.fid}, &Rflushf{}) } - -// Renamed implements File.Renamed. -func (c *clientFile) Renamed(newDir File, newName string) {} diff --git a/pkg/p9/file.go b/pkg/p9/file.go index cab35896f..c2e3a3f98 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -135,6 +135,14 @@ type File interface { // On the server, Close has no concurrency guarantee. Close() error + // SetAttrClose is the equivalent of calling SetAttr() followed by Close(). + // This can be used to set file times before closing the file in a single + // operation. + // + // On the server, SetAttr has a write concurrency guarantee. + // On the server, Close has no concurrency guarantee. + SetAttrClose(valid SetAttrMask, attr SetAttr) error + // Open must be called prior to using Read, Write or Readdir. Once Open // is called, some operations, such as Walk, will no longer work. 
//
@@ -286,3 +294,19 @@ type DefaultWalkGetAttr struct{}
 func (DefaultWalkGetAttr) WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) {
 	return nil, nil, AttrMask{}, Attr{}, syscall.ENOSYS
 }
+
+// DisallowClientCalls panics if a client-only function is called.
+type DisallowClientCalls struct{}
+
+// SetAttrClose implements File.SetAttrClose.
+func (DisallowClientCalls) SetAttrClose(SetAttrMask, SetAttr) error {
+	panic("SetAttrClose should not be called on the server")
+}
+
+// DisallowServerCalls panics if a server-only function is called.
+type DisallowServerCalls struct{}
+
+// Renamed implements File.Renamed.
+func (*DisallowServerCalls) Renamed(File, string) {
+	panic("Renamed should not be called on the client")
+}
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 1db5797dd..abd237f46 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -123,6 +123,37 @@ func (t *Tclunk) handle(cs *connState) message {
 	return &Rclunk{}
 }
 
+func (t *Tsetattrclunk) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	setAttrErr := ref.safelyWrite(func() error {
+		// We don't allow setattr on files that have been deleted.
+		// This might be technically incorrect, as it's possible that
+		// there were multiple links and you can still change the
+		// corresponding inode information.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+
+		// Set the attributes.
+		return ref.file.SetAttr(t.Valid, t.SetAttr)
+	})
+
+	// Try to delete FID even in case of failure above. Since the state of the
+	// file is unknown to the caller, it will not attempt to close the file again.
+	if !cs.DeleteFID(t.FID) {
+		return newErr(syscall.EBADF)
+	}
+	if setAttrErr != nil {
+		return newErr(setAttrErr)
+	}
+	return &Rsetattrclunk{}
+}
+
 // handle implements handler.handle.
 func (t *Tremove) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.FID)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index 2cb59f934..cf13cbb69 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -317,6 +317,64 @@ func (r *Rclunk) String() string {
 	return "Rclunk{}"
 }
 
+// Tsetattrclunk is a setattr+close request.
+type Tsetattrclunk struct {
+	// FID is the FID to change.
+	FID FID
+
+	// Valid is the set of bits which will be used.
+	Valid SetAttrMask
+
+	// SetAttr is the set request.
+	SetAttr SetAttr
+}
+
+// decode implements encoder.decode.
+func (t *Tsetattrclunk) decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Valid.decode(b)
+	t.SetAttr.decode(b)
+}
+
+// encode implements encoder.encode.
+func (t *Tsetattrclunk) encode(b *buffer) {
+	b.WriteFID(t.FID)
+	t.Valid.encode(b)
+	t.SetAttr.encode(b)
+}
+
+// Type implements message.Type.
+func (*Tsetattrclunk) Type() MsgType {
+	return MsgTsetattrclunk
+}
+
+// String implements fmt.Stringer.
+func (t *Tsetattrclunk) String() string {
+	return fmt.Sprintf("Tsetattrclunk{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr)
+}
+
+// Rsetattrclunk is a setattr+close response.
+type Rsetattrclunk struct{}
+
+// decode implements encoder.decode.
+func (*Rsetattrclunk) decode(*buffer) {
+}
+
+// encode implements encoder.encode.
+func (*Rsetattrclunk) encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rsetattrclunk) Type() MsgType {
+	return MsgRsetattrclunk
+}
+
+// String implements fmt.Stringer.
+func (r *Rsetattrclunk) String() string {
+	return "Rsetattrclunk{}"
+}
+
 // Tremove is a remove request.
 //
 // This will eventually be replaced by Tunlinkat.
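On the wire, the new pair fuses Tsetattr and Tclunk into a single round trip. A hedged client-side sketch (f is an assumed p9.File obtained from a client walk; the field values are illustrative): when the negotiated version is 12 or higher this sends one Tsetattrclunk, and otherwise it falls back to SetAttr followed by Close, per the clientFile.SetAttrClose implementation above:

	// Set mtime explicitly, then clunk the FID, in one message when supported.
	err := f.SetAttrClose(p9.SetAttrMask{
		MTime:              true,
		MTimeNotSystemTime: true,
	}, p9.SetAttr{
		MTimeSeconds:     1,
		MTimeNanoSeconds: 2,
	})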
@@ -2657,6 +2715,8 @@ func init() { msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} }) msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} }) msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} }) + msgRegistry.register(MsgTsetattrclunk, func() message { return &Tsetattrclunk{} }) + msgRegistry.register(MsgRsetattrclunk, func() message { return &Rsetattrclunk{} }) msgRegistry.register(MsgTchannel, func() message { return &Tchannel{} }) msgRegistry.register(MsgRchannel, func() message { return &Rchannel{} }) } diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index 7facc9f5e..bfeb6c236 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -376,6 +376,30 @@ func TestEncodeDecode(t *testing.T) { &Rumknod{ Rmknod{QID: QID{Type: 1}}, }, + &Tsetattrclunk{ + FID: 1, + Valid: SetAttrMask{ + Permissions: true, + UID: true, + GID: true, + Size: true, + ATime: true, + MTime: true, + CTime: true, + ATimeNotSystemTime: true, + MTimeNotSystemTime: true, + }, + SetAttr: SetAttr{ + Permissions: 1, + UID: 2, + GID: 3, + Size: 4, + ATimeSeconds: 5, + ATimeNanoSeconds: 6, + MTimeSeconds: 7, + MTimeNanoSeconds: 8, + }, + }, } for _, enc := range objs { diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 122c457d2..2235f8968 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -315,86 +315,88 @@ type MsgType uint8 // MsgType declarations. const ( - MsgTlerror MsgType = 6 - MsgRlerror = 7 - MsgTstatfs = 8 - MsgRstatfs = 9 - MsgTlopen = 12 - MsgRlopen = 13 - MsgTlcreate = 14 - MsgRlcreate = 15 - MsgTsymlink = 16 - MsgRsymlink = 17 - MsgTmknod = 18 - MsgRmknod = 19 - MsgTrename = 20 - MsgRrename = 21 - MsgTreadlink = 22 - MsgRreadlink = 23 - MsgTgetattr = 24 - MsgRgetattr = 25 - MsgTsetattr = 26 - MsgRsetattr = 27 - MsgTlistxattr = 28 - MsgRlistxattr = 29 - MsgTxattrwalk = 30 - MsgRxattrwalk = 31 - MsgTxattrcreate = 32 - MsgRxattrcreate = 33 - MsgTgetxattr = 34 - MsgRgetxattr = 35 - MsgTsetxattr = 36 - MsgRsetxattr = 37 - MsgTremovexattr = 38 - MsgRremovexattr = 39 - MsgTreaddir = 40 - MsgRreaddir = 41 - MsgTfsync = 50 - MsgRfsync = 51 - MsgTlink = 70 - MsgRlink = 71 - MsgTmkdir = 72 - MsgRmkdir = 73 - MsgTrenameat = 74 - MsgRrenameat = 75 - MsgTunlinkat = 76 - MsgRunlinkat = 77 - MsgTversion = 100 - MsgRversion = 101 - MsgTauth = 102 - MsgRauth = 103 - MsgTattach = 104 - MsgRattach = 105 - MsgTflush = 108 - MsgRflush = 109 - MsgTwalk = 110 - MsgRwalk = 111 - MsgTread = 116 - MsgRread = 117 - MsgTwrite = 118 - MsgRwrite = 119 - MsgTclunk = 120 - MsgRclunk = 121 - MsgTremove = 122 - MsgRremove = 123 - MsgTflushf = 124 - MsgRflushf = 125 - MsgTwalkgetattr = 126 - MsgRwalkgetattr = 127 - MsgTucreate = 128 - MsgRucreate = 129 - MsgTumkdir = 130 - MsgRumkdir = 131 - MsgTumknod = 132 - MsgRumknod = 133 - MsgTusymlink = 134 - MsgRusymlink = 135 - MsgTlconnect = 136 - MsgRlconnect = 137 - MsgTallocate = 138 - MsgRallocate = 139 - MsgTchannel = 250 - MsgRchannel = 251 + MsgTlerror MsgType = 6 + MsgRlerror MsgType = 7 + MsgTstatfs MsgType = 8 + MsgRstatfs MsgType = 9 + MsgTlopen MsgType = 12 + MsgRlopen MsgType = 13 + MsgTlcreate MsgType = 14 + MsgRlcreate MsgType = 15 + MsgTsymlink MsgType = 16 + MsgRsymlink MsgType = 17 + MsgTmknod MsgType = 18 + MsgRmknod MsgType = 19 + MsgTrename MsgType = 20 + MsgRrename MsgType = 21 + MsgTreadlink MsgType = 22 + MsgRreadlink MsgType = 23 + MsgTgetattr MsgType = 24 + MsgRgetattr MsgType = 25 + MsgTsetattr MsgType = 26 + MsgRsetattr MsgType = 27 + MsgTlistxattr MsgType = 28 + MsgRlistxattr MsgType = 29 + 
MsgTxattrwalk MsgType = 30 + MsgRxattrwalk MsgType = 31 + MsgTxattrcreate MsgType = 32 + MsgRxattrcreate MsgType = 33 + MsgTgetxattr MsgType = 34 + MsgRgetxattr MsgType = 35 + MsgTsetxattr MsgType = 36 + MsgRsetxattr MsgType = 37 + MsgTremovexattr MsgType = 38 + MsgRremovexattr MsgType = 39 + MsgTreaddir MsgType = 40 + MsgRreaddir MsgType = 41 + MsgTfsync MsgType = 50 + MsgRfsync MsgType = 51 + MsgTlink MsgType = 70 + MsgRlink MsgType = 71 + MsgTmkdir MsgType = 72 + MsgRmkdir MsgType = 73 + MsgTrenameat MsgType = 74 + MsgRrenameat MsgType = 75 + MsgTunlinkat MsgType = 76 + MsgRunlinkat MsgType = 77 + MsgTversion MsgType = 100 + MsgRversion MsgType = 101 + MsgTauth MsgType = 102 + MsgRauth MsgType = 103 + MsgTattach MsgType = 104 + MsgRattach MsgType = 105 + MsgTflush MsgType = 108 + MsgRflush MsgType = 109 + MsgTwalk MsgType = 110 + MsgRwalk MsgType = 111 + MsgTread MsgType = 116 + MsgRread MsgType = 117 + MsgTwrite MsgType = 118 + MsgRwrite MsgType = 119 + MsgTclunk MsgType = 120 + MsgRclunk MsgType = 121 + MsgTremove MsgType = 122 + MsgRremove MsgType = 123 + MsgTflushf MsgType = 124 + MsgRflushf MsgType = 125 + MsgTwalkgetattr MsgType = 126 + MsgRwalkgetattr MsgType = 127 + MsgTucreate MsgType = 128 + MsgRucreate MsgType = 129 + MsgTumkdir MsgType = 130 + MsgRumkdir MsgType = 131 + MsgTumknod MsgType = 132 + MsgRumknod MsgType = 133 + MsgTusymlink MsgType = 134 + MsgRusymlink MsgType = 135 + MsgTlconnect MsgType = 136 + MsgRlconnect MsgType = 137 + MsgTallocate MsgType = 138 + MsgRallocate MsgType = 139 + MsgTsetattrclunk MsgType = 140 + MsgRsetattrclunk MsgType = 141 + MsgTchannel MsgType = 250 + MsgRchannel MsgType = 251 ) // QIDType represents the file type for QIDs. diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 6e7bb3db2..6e605b14c 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -1225,22 +1225,31 @@ func TestOpen(t *testing.T) { func TestClose(t *testing.T) { type closeTest struct { name string - closeFn func(backend *Mock, f p9.File) + closeFn func(backend *Mock, f p9.File) error } cases := []closeTest{ { name: "close", - closeFn: func(_ *Mock, f p9.File) { - f.Close() + closeFn: func(_ *Mock, f p9.File) error { + return f.Close() }, }, { name: "remove", - closeFn: func(backend *Mock, f p9.File) { + closeFn: func(backend *Mock, f p9.File) error { // Allow the rename call in the parent, automatically translated. backend.parent.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Times(1) - f.(deprecatedRemover).Remove() + return f.(deprecatedRemover).Remove() + }, + }, + { + name: "setAttrClose", + closeFn: func(backend *Mock, f p9.File) error { + valid := p9.SetAttrMask{ATime: true} + attr := p9.SetAttr{ATimeSeconds: 1, ATimeNanoSeconds: 2} + backend.EXPECT().SetAttr(valid, attr).Times(1) + return f.SetAttrClose(valid, attr) }, }, } @@ -1258,7 +1267,9 @@ func TestClose(t *testing.T) { _, backend, f := walkHelper(h, name, root) // Close via the prescribed method. - tc.closeFn(backend, f) + if err := tc.closeFn(backend, f); err != nil { + t.Fatalf("closeFn failed: %v", err) + } // Everything should fail with EBADF. if _, _, err := f.Walk(nil); err != syscall.EBADF { diff --git a/pkg/p9/version.go b/pkg/p9/version.go index 09cde9f5a..8d7168ef5 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -26,7 +26,7 @@ const ( // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. 
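The negotiation described above amounts to a downward scan over version numbers. A minimal sketch of the idea, assuming a requestVersion helper that stands in for a single Tversion round trip (the real client threads this through its connection state; fmt is assumed imported):

    // negotiateVersion is illustrative only; it is not the client's actual API.
    // It requests the highest supported version and decrements until the server
    // accepts one.
    func negotiateVersion(requestVersion func(version uint32) error) (uint32, error) {
        for v := highestSupportedVersion; v >= lowestSupportedVersion; v-- {
            if err := requestVersion(v); err == nil {
                return v, nil
            }
        }
        return 0, fmt.Errorf("no mutually supported 9P2000.L.Google version")
    }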
- highestSupportedVersion uint32 = 11 + highestSupportedVersion uint32 = 12 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. @@ -173,3 +173,9 @@ func versionSupportsGetSetXattr(v uint32) bool { func versionSupportsListRemoveXattr(v uint32) bool { return v >= 11 } + +// versionSupportsTsetattrclunk returns true if version v supports +// the Tsetattrclunk message. +func versionSupportsTsetattrclunk(v uint32) bool { + return v >= 12 +} diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s index 7c622e5d7..a45920040 100644 --- a/pkg/procid/procid_amd64.s +++ b/pkg/procid/procid_amd64.s @@ -14,7 +14,7 @@ // +build amd64 // +build go1.8 -// +build !go1.16 +// +build !go1.17 #include "textflag.h" diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s index 48ebb5fd1..9d3b0666d 100644 --- a/pkg/procid/procid_arm64.s +++ b/pkg/procid/procid_arm64.s @@ -14,7 +14,7 @@ // +build arm64 // +build go1.8 -// +build !go1.16 +// +build !go1.17 #include "textflag.h" diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index d9d5e6bcb..699ea8ac3 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -234,6 +234,41 @@ const ( LeaksLogTraces ) +// Set implements flag.Value. +func (l *LeakMode) Set(v string) error { + switch v { + case "disabled": + *l = NoLeakChecking + case "log-names": + *l = LeaksLogWarning + case "log-traces": + *l = LeaksLogTraces + default: + return fmt.Errorf("invalid ref leak mode %q", v) + } + return nil +} + +// Get implements flag.Value. +func (l *LeakMode) Get() interface{} { + return *l +} + +// String implements flag.Value. +func (l *LeakMode) String() string { + switch *l { + case UninitializedLeakChecking: + return "uninitialized" + case NoLeakChecking: + return "disabled" + case LeaksLogWarning: + return "log-names" + case LeaksLogTraces: + return "log-traces" + } + panic(fmt.Sprintf("invalid ref leak mode %d", *l)) +} + // leakMode stores the current mode for the reference leak checker. // // Values must be one of the LeakMode values. diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD index 7b3e10683..577b827a5 100644 --- a/pkg/refs_vfs2/BUILD +++ b/pkg/refs_vfs2/BUILD @@ -11,7 +11,7 @@ go_template( types = [ "T", ], - visibility = ["//pkg/sentry:internal"], + visibility = ["//:sandbox"], deps = [ "//pkg/log", "//pkg/refs", diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go index 99c43c065..d9b552896 100644 --- a/pkg/refs_vfs2/refs_template.go +++ b/pkg/refs_vfs2/refs_template.go @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package refs_template defines a template that can be used by reference counted -// objects. +// Package refs_template defines a template that can be used by reference +// counted objects. The "owner" template parameter is used in log messages to +// indicate the type of reference-counted object that exhibited a reference +// leak. As a result, structs that are embedded in other structs should not use +// this template, since it will make tracking down leaks more difficult. package refs_template import ( + "fmt" "runtime" "sync/atomic" @@ -38,6 +42,11 @@ var ownerType *T // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. 
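The bias means a zero-value Refs is born already holding a reference. A hedged usage sketch of the semantics described above (destroy is a placeholder destructor supplied by the owner):

    destroy := func() { /* release backing resources */ }
    var r Refs        // zero value: refCount == 0, i.e. one outstanding reference
    r.IncRef()        // refCount == 1: two references
    r.DecRef(destroy) // refCount == 0: back to one reference; destroy is not called
    r.DecRef(destroy) // refCount == -1: final reference dropped; destroy() runs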
// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type Refs struct { // refCount is composed of two fields: @@ -82,7 +91,7 @@ func (r *Refs) ReadRefs() int64 { //go:nosplit func (r *Refs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ownerType)) } } @@ -122,7 +131,7 @@ func (r *Refs) TryIncRef() bool { func (r *Refs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ownerType)) case v == -1: // Call the destructor. diff --git a/pkg/safemem/BUILD b/pkg/safemem/BUILD index ce30382ab..68ed074f8 100644 --- a/pkg/safemem/BUILD +++ b/pkg/safemem/BUILD @@ -11,9 +11,7 @@ go_library( "seq_unsafe.go", ], visibility = ["//:sandbox"], - deps = [ - "//pkg/safecopy", - ], + deps = ["//pkg/safecopy"], ) go_test( diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go index f5f0574f8..fc4049eeb 100644 --- a/pkg/safemem/seq_unsafe.go +++ b/pkg/safemem/seq_unsafe.go @@ -91,9 +91,10 @@ func BlockSeqFromSlice(slice []Block) BlockSeq { return blockSeqFromSliceLimited(slice, limit) } -// Preconditions: The combined length of all Blocks in slice <= limit. If -// len(slice) != 0, the first Block in slice has non-zero length, and limit > -// 0. +// Preconditions: +// * The combined length of all Blocks in slice <= limit. +// * If len(slice) != 0, the first Block in slice has non-zero length and +// limit > 0. func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { switch len(slice) { case 0: diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index 29aeaab8c..e828894b0 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -10,6 +10,7 @@ go_binary( "seccomp_test_victim_amd64.go", "seccomp_test_victim_arm64.go", ], + nogo = False, deps = [":seccomp"], ) @@ -48,7 +49,7 @@ go_test( library = ":seccomp", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/bpf", + "//pkg/usermem", ], ) diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 55fd6967e..752e2dc32 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package seccomp provides basic seccomp filters for x86_64 (little endian). +// Package seccomp provides generation of basic seccomp filters. Currently, +// only little endian systems are supported. package seccomp import ( @@ -64,9 +65,9 @@ func Install(rules SyscallRules) error { Rules: rules, Action: linux.SECCOMP_RET_ALLOW, }, - }, defaultAction) + }, defaultAction, defaultAction) if log.IsLogging(log.Debug) { - programStr, errDecode := bpf.DecodeProgram(instrs) + programStr, errDecode := bpf.DecodeInstructions(instrs) if errDecode != nil { programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr) } @@ -117,7 +118,7 @@ var SyscallName = func(sysno uintptr) string { // BuildProgram builds a BPF program from the given map of actions to matching // SyscallRules. The single generated program covers all provided RuleSets. 
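The hunk just below threads a second action through BuildProgram so that an architecture mismatch no longer falls through to the default action. A sketch of the resulting call shape; the rule contents and chosen actions are illustrative, not taken from the change itself (assumed imports: syscall, pkg/abi/linux, pkg/seccomp):

    // Allow getpid unconditionally, trap everything else, and kill the
    // thread when the audit architecture does not match.
    func installGetpidFilter() error {
        instrs, err := seccomp.BuildProgram([]seccomp.RuleSet{{
            Rules:  seccomp.SyscallRules{syscall.SYS_GETPID: {}},
            Action: linux.SECCOMP_RET_ALLOW,
        }}, linux.SECCOMP_RET_TRAP, linux.SECCOMP_RET_KILL_THREAD)
        if err != nil {
            return err
        }
        _ = instrs // compile with bpf.Compile and load, as Install does above
        return nil
    }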
-func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) {
+func BuildProgram(rules []RuleSet, defaultAction, badArchAction linux.BPFAction) ([]linux.BPFInstruction, error) {
 	program := bpf.NewProgramBuilder()
 
 	// Be paranoid and check that syscall is done in the expected architecture.
@@ -128,7 +129,7 @@ func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFIn
 	// defaultLabel is at the bottom of the program. The size of the program
 	// may exceed 255 instructions, which is the limit of a conditional jump.
 	program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0)
-	program.AddDirectJumpLabel(defaultLabel)
+	program.AddStmt(bpf.Ret|bpf.K, uint32(badArchAction))
 	if err := buildIndex(rules, program); err != nil {
 		return nil, err
 	}
@@ -144,6 +145,11 @@ func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFIn
 
 // buildIndex builds a BST to quickly search through all syscalls.
 func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
+	// Do nothing if rules is empty.
+	if len(rules) == 0 {
+		return nil
+	}
+
 	// Build a list of all application system calls, across all given rule
 	// sets. We have a simple BST, but may dispatch individual matchers
 	// with different actions. The matchers are evaluated linearly.
@@ -216,42 +222,163 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc
 	labelled := false
 	for i, arg := range rule {
 		if arg != nil {
+			// Skip this argument if it matches any value (MatchAny),
+			// since no further instructions are required.
+			if _, ok := arg.(MatchAny); ok {
+				continue
+			}
+
+			// Determine the data offset for low and high bits of input.
+			dataOffsetLow := seccompDataOffsetArgLow(i)
+			dataOffsetHigh := seccompDataOffsetArgHigh(i)
+			if i == RuleIP {
+				dataOffsetLow = seccompDataOffsetIPLow
+				dataOffsetHigh = seccompDataOffsetIPHigh
+			}
+
+			// Add the conditional operation. Input values to the BPF
+			// program are 64bit values. However, comparisons in BPF can
+			// only be done on 32bit values. This means that we need to do
+			// multiple BPF comparisons in order to do one logical 64bit
+			// comparison.
 			switch a := arg.(type) {
-			case AllowAny:
-			case AllowValue:
-				dataOffsetLow := seccompDataOffsetArgLow(i)
-				dataOffsetHigh := seccompDataOffsetArgHigh(i)
-				if i == RuleIP {
-					dataOffsetLow = seccompDataOffsetIPLow
-					dataOffsetHigh = seccompDataOffsetIPHigh
-				}
+			case EqualTo:
+				// EqualTo checks that both the higher and lower 32bits are equal.
 				high, low := uint32(a>>32), uint32(a)
-				// assert arg_low == low
+
+				// Assert that the lower 32bits are equal.
+				// arg_low == low ? continue : violation
 				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
-				// assert arg_high == high
+
+				// Assert that the higher 32bits are also equal.
+				// arg_high == high ? continue/success : violation
 				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 				labelled = true
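Because BPF compares one 32-bit word at a time, each 64-bit predicate here decomposes into a high-word and a low-word comparison; the remaining cases below all follow that shape. As plain Go, an illustration of the decomposition (not code from the filter itself):

    // eq64 mirrors the EqualTo case: both words must match.
    func eq64(arg, val uint64) bool {
        return uint32(arg) == uint32(val) && uint32(arg>>32) == uint32(val>>32)
    }

    // gt64 mirrors the GreaterThan case: the high words decide, unless they
    // are equal, in which case the low words decide.
    func gt64(arg, val uint64) bool {
        if uint32(arg>>32) != uint32(val>>32) {
            return uint32(arg>>32) > uint32(val>>32)
        }
        return uint32(arg) > uint32(val)
    }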
+			case NotEqual:
+				// NotEqual checks that either the higher or lower 32bits
+				// are *not* equal.
+				high, low := uint32(a>>32), uint32(a)
+				labelGood := fmt.Sprintf("ne%v", i)
+
+				// Check whether the lower 32bits are (not) equal.
+				// arg_low == low ? continue : success
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+				// Assert that the higher 32bits are not equal (given that
+				// the lower bits are equal).
+				// arg_high == high ? violation : continue/success
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+				p.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+				p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+				labelled = true
 			case GreaterThan:
-				dataOffsetLow := seccompDataOffsetArgLow(i)
-				dataOffsetHigh := seccompDataOffsetArgHigh(i)
-				if i == RuleIP {
-					dataOffsetLow = seccompDataOffsetIPLow
-					dataOffsetHigh = seccompDataOffsetIPHigh
-				}
-				labelGood := fmt.Sprintf("gt%v", i)
+				// GreaterThan checks that the higher 32bits are greater
+				// *or* that the higher 32bits are equal and the lower
+				// 32bits are greater.
 				high, low := uint32(a>>32), uint32(a)
-				// assert arg_high < high
+				labelGood := fmt.Sprintf("gt%v", i)
+
+				// Assert the higher 32bits are greater than or equal.
+				// arg_high >= high ? continue : violation (arg_high < high)
 				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
-				// arg_high > high
+
+				// Assert that the lower 32bits are greater.
+				// arg_high == high ? continue : success (arg_high > high)
 				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
-				// arg_low < low
+				// arg_low > low ? continue/success : violation (arg_high == high and arg_low <= low)
 				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 				p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
 				labelled = true
+			case GreaterThanOrEqual:
+				// GreaterThanOrEqual checks that the higher 32bits are
+				// greater *or* that the higher 32bits are equal and the
+				// lower 32bits are greater than or equal.
+				high, low := uint32(a>>32), uint32(a)
+				labelGood := fmt.Sprintf("ge%v", i)
+
+				// Assert the higher 32bits are greater than or equal.
+				// arg_high >= high ? continue : violation (arg_high < high)
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+				// arg_high == high ? continue : success (arg_high > high)
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+				// Assert that the lower 32bits are greater than or equal
+				// (assuming the higher bits are equal).
+				// arg_low >= low ? continue/success : violation (arg_high == high and arg_low < low)
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+				p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+				labelled = true
+			case LessThan:
+				// LessThan checks that the higher 32bits are less *or* that
+				// the higher 32bits are equal and the lower 32bits are
+				// less.
+				high, low := uint32(a>>32), uint32(a)
+				labelGood := fmt.Sprintf("lt%v", i)
+
+				// Assert the higher 32bits are less than or equal.
+				// arg_high > high ? violation : continue
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+				p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+				// arg_high == high ? continue : success (arg_high < high)
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+				// Assert that the lower 32bits are less (assuming the
+				// higher bits are equal).
+				// arg_low >= low ? violation : continue
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+				p.AddJumpTrueLabel(bpf.Jmp|bpf.Jge|bpf.K, low, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+				p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+				labelled = true
+			case LessThanOrEqual:
+				// LessThanOrEqual checks that the higher 32bits are less
+				// *or* that the higher 32bits are equal and the lower
+				// 32bits are less than or equal.
+				high, low := uint32(a>>32), uint32(a)
+				labelGood := fmt.Sprintf("le%v", i)
+
+				// Assert the higher 32bits are less than or equal.
+				// arg_high > high ? violation : continue
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+				p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+				// arg_high == high ? continue : success
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+				// Assert the lower bits are less than or equal (assuming
+				// the higher bits are equal).
+				// arg_low > low ? violation : success
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+				p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+				p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+				labelled = true
+			case maskedEqual:
+				// maskedEqual checks that the bitwise AND of the input and
+				// the mask equals the expected value, for both the higher
+				// and lower 32bits.
+				high, low := uint32(a.value>>32), uint32(a.value)
+				maskHigh, maskLow := uint32(a.mask>>32), uint32(a.mask)
+
+				// Assert that the lower 32bits are equal when masked.
+				// A <- arg_low.
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+				// A <- arg_low & maskLow
+				p.AddStmt(bpf.Alu|bpf.And|bpf.K, maskLow)
+				// Assert that arg_low & maskLow == low.
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+
+				// Assert that the higher 32bits are equal when masked.
+				// A <- arg_high
+				p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+				// A <- arg_high & maskHigh
+				p.AddStmt(bpf.Alu|bpf.And|bpf.K, maskHigh)
+				// Assert that arg_high & maskHigh == high.
+				p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+				labelled = true
 			default:
 				return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
 			}
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index a52dc1b4e..daf165bbf 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -39,28 +39,79 @@ func seccompDataOffsetArgHigh(i int) uint32 {
 	return seccompDataOffsetArgLow(i) + 4
 }
 
-// AllowAny is marker to indicate any value will be accepted.
-type AllowAny struct{}
+// MatchAny is a marker to indicate that any value will be accepted.
+type MatchAny struct{}
 
-func (a AllowAny) String() (s string) {
+func (a MatchAny) String() (s string) {
 	return "*"
 }
 
-// AllowValue specifies a value that needs to be strictly matched.
-type AllowValue uintptr
+// EqualTo specifies a value that needs to be strictly matched.
+type EqualTo uintptr
+
+func (a EqualTo) String() (s string) {
+	return fmt.Sprintf("== %#x", uintptr(a))
+}
+
+// NotEqual specifies a value that the argument must not equal.
+type NotEqual uintptr
+
+func (a NotEqual) String() (s string) {
+	return fmt.Sprintf("!= %#x", uintptr(a))
+}
 
 // GreaterThan specifies a value that the argument must be strictly greater than.
 type GreaterThan uintptr
 
-func (a AllowValue) String() (s string) {
-	return fmt.Sprintf("%#x ", uintptr(a))
+func (a GreaterThan) String() (s string) {
+	return fmt.Sprintf("> %#x", uintptr(a))
+}
+
+// GreaterThanOrEqual specifies a value that the argument must be greater than
+// or equal to.
+type GreaterThanOrEqual uintptr
+
+func (a GreaterThanOrEqual) String() (s string) {
+	return fmt.Sprintf(">= %#x", uintptr(a))
+}
+
+// LessThan specifies a value that the argument must be strictly less than.
+type LessThan uintptr
+
+func (a LessThan) String() (s string) {
+	return fmt.Sprintf("< %#x", uintptr(a))
+}
+
+// LessThanOrEqual specifies a value that the argument must be less than or
+// equal to.
+type LessThanOrEqual uintptr
+
+func (a LessThanOrEqual) String() (s string) {
+	return fmt.Sprintf("<= %#x", uintptr(a))
+}
+
+type maskedEqual struct {
+	mask  uintptr
+	value uintptr
+}
+
+func (a maskedEqual) String() (s string) {
+	return fmt.Sprintf("& %#x == %#x", a.mask, a.value)
+}
+
+// MaskedEqual specifies a value that matches the input after the input is
+// masked (bitwise &) against the given mask. It can be used to verify that
+// the input only includes certain approved flags.
+func MaskedEqual(mask, value uintptr) interface{} {
+	return maskedEqual{
+		mask:  mask,
+		value: value,
+	}
+}
 
 // Rule stores the allowed syscall arguments.
 //
 // For example:
 //  rule := Rule {
-//         AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0
+//         EqualTo(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0
 //  }
 type Rule [7]interface{} // 6 arguments + RIP
@@ -89,12 +140,12 @@ func (r Rule) String() (s string) {
 //  rules := SyscallRules{
 //         syscall.SYS_FUTEX: []Rule{
 //                 {
-//                         AllowAny{},
-//                         AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+//                         MatchAny{},
+//                         EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
 //                 }, // OR
 //                 {
-//                         AllowAny{},
-//                         AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+//                         MatchAny{},
+//                         EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
 //                 },
 //         },
 //         syscall.SYS_GETPID: []Rule{},
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 5238df8bd..e1444d18b 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -28,17 +28,10 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-type seccompData struct {
-	nr                 uint32
-	arch               uint32
-	instructionPointer uint64
-	args               [6]uint64
-}
-
 // newVictim makes a victim binary.
 func newVictim() (string, error) {
 	f, err := ioutil.TempFile("", "victim")
@@ -58,9 +51,14 @@ func newVictim() (string, error) {
 	return path, nil
 }
 
-// asInput converts a seccompData to a bpf.Input.
-func (d *seccompData) asInput() bpf.Input {
-	return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian}
+// dataAsInput converts a linux.SeccompData to a bpf.Input.
+func dataAsInput(d *linux.SeccompData) bpf.Input {
+	buf := make([]byte, d.SizeBytes())
+	d.MarshalUnsafe(buf)
+	return bpf.InputBytes{
+		Data:  buf,
+		Order: usermem.ByteOrder,
+	}
 }
 
 func TestBasic(t *testing.T) {
@@ -69,18 +67,21 @@ func TestBasic(t *testing.T) {
 		desc string
 
 		// data is the input data.
-		data seccompData
+		data linux.SeccompData
 
 		// want is the expected return value of the BPF program.
want linux.BPFAction } for _, test := range []struct { + name string ruleSets []RuleSet defaultAction linux.BPFAction + badArchAction linux.BPFAction specs []spec }{ { + name: "Single syscall", ruleSets: []RuleSet{ { Rules: SyscallRules{1: {}}, @@ -88,26 +89,28 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Single syscall allowed", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH}, + desc: "syscall allowed", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Single syscall disallowed", - data: seccompData{nr: 2, arch: LINUX_AUDIT_ARCH}, + desc: "syscall disallowed", + data: linux.SeccompData{Nr: 2, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, }, }, { + name: "Multiple rulesets", ruleSets: []RuleSet{ { Rules: SyscallRules{ 1: []Rule{ { - AllowValue(0x1), + EqualTo(0x1), }, }, }, @@ -122,30 +125,32 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_KILL_THREAD, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Multiple rulesets allowed (1a)", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x1}}, + desc: "allowed (1a)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x1}}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Multiple rulesets allowed (1b)", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH}, + desc: "allowed (1b)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, { - desc: "Multiple rulesets allowed (2)", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH}, + desc: "syscall 1 matched 2nd rule", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, { - desc: "Multiple rulesets allowed (2)", - data: seccompData{nr: 0, arch: LINUX_AUDIT_ARCH}, + desc: "no match", + data: linux.SeccompData{Nr: 0, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_KILL_THREAD, }, }, }, { + name: "Multiple syscalls", ruleSets: []RuleSet{ { Rules: SyscallRules{ @@ -157,50 +162,52 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Multiple syscalls allowed (1)", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH}, + desc: "allowed (1)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Multiple syscalls allowed (3)", - data: seccompData{nr: 3, arch: LINUX_AUDIT_ARCH}, + desc: "allowed (3)", + data: linux.SeccompData{Nr: 3, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Multiple syscalls allowed (5)", - data: seccompData{nr: 5, arch: LINUX_AUDIT_ARCH}, + desc: "allowed (5)", + data: linux.SeccompData{Nr: 5, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Multiple syscalls disallowed (0)", - data: seccompData{nr: 0, arch: LINUX_AUDIT_ARCH}, + desc: "disallowed (0)", + data: linux.SeccompData{Nr: 0, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, { - desc: "Multiple syscalls disallowed (2)", - data: seccompData{nr: 2, arch: LINUX_AUDIT_ARCH}, + desc: "disallowed (2)", + data: linux.SeccompData{Nr: 2, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, { - desc: "Multiple syscalls disallowed (4)", - data: seccompData{nr: 4, arch: LINUX_AUDIT_ARCH}, + desc: "disallowed (4)", + data: linux.SeccompData{Nr: 4, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, { - 
desc: "Multiple syscalls disallowed (6)", - data: seccompData{nr: 6, arch: LINUX_AUDIT_ARCH}, + desc: "disallowed (6)", + data: linux.SeccompData{Nr: 6, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, { - desc: "Multiple syscalls disallowed (100)", - data: seccompData{nr: 100, arch: LINUX_AUDIT_ARCH}, + desc: "disallowed (100)", + data: linux.SeccompData{Nr: 100, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, }, }, { + name: "Wrong architecture", ruleSets: []RuleSet{ { Rules: SyscallRules{ @@ -210,15 +217,17 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Wrong architecture", - data: seccompData{nr: 1, arch: 123}, - want: linux.SECCOMP_RET_TRAP, + desc: "arch (123)", + data: linux.SeccompData{Nr: 1, Arch: 123}, + want: linux.SECCOMP_RET_KILL_THREAD, }, }, }, { + name: "Syscall disallowed", ruleSets: []RuleSet{ { Rules: SyscallRules{ @@ -228,22 +237,24 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Syscall disallowed, action trap", - data: seccompData{nr: 2, arch: LINUX_AUDIT_ARCH}, + desc: "action trap", + data: linux.SeccompData{Nr: 2, Arch: LINUX_AUDIT_ARCH}, want: linux.SECCOMP_RET_TRAP, }, }, }, { + name: "Syscall arguments", ruleSets: []RuleSet{ { Rules: SyscallRules{ 1: []Rule{ { - AllowAny{}, - AllowValue(0xf), + MatchAny{}, + EqualTo(0xf), }, }, }, @@ -251,29 +262,31 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Syscall argument allowed", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf, 0xf}}, + desc: "allowed", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xf}}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Syscall argument disallowed", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf, 0xe}}, + desc: "disallowed", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xe}}, want: linux.SECCOMP_RET_TRAP, }, }, }, { + name: "Multiple arguments", ruleSets: []RuleSet{ { Rules: SyscallRules{ 1: []Rule{ { - AllowValue(0xf), + EqualTo(0xf), }, { - AllowValue(0xe), + EqualTo(0xe), }, }, }, @@ -281,28 +294,30 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "Syscall argument allowed, two rules", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf}}, + desc: "match first rule", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf}}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "Syscall argument allowed, two rules", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xe}}, + desc: "match 2nd rule", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xe}}, want: linux.SECCOMP_RET_ALLOW, }, }, }, { + name: "EqualTo", ruleSets: []RuleSet{ { Rules: SyscallRules{ 1: []Rule{ { - AllowValue(0), - AllowValue(math.MaxUint64 - 1), - AllowValue(math.MaxUint32), + EqualTo(0), + EqualTo(math.MaxUint64 - 1), + EqualTo(math.MaxUint32), }, }, }, @@ -310,37 +325,135 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "64bit syscall argument allowed", - data: seccompData{ - nr: 1, - arch: 
LINUX_AUDIT_ARCH,
-						args: [6]uint64{0, math.MaxUint64 - 1, math.MaxUint32},
+					desc: "argument allowed (all match)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64 - 1, math.MaxUint32},
 					},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "64bit syscall argument disallowed",
-					data: seccompData{
-						nr:   1,
-						arch: LINUX_AUDIT_ARCH,
-						args: [6]uint64{0, math.MaxUint64, math.MaxUint32},
+					desc: "argument disallowed (one mismatch)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64, math.MaxUint32},
 					},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "64bit syscall argument disallowed",
-					data: seccompData{
-						nr:   1,
-						arch: LINUX_AUDIT_ARCH,
-						args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1},
+					desc: "argument disallowed (multiple mismatch)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1},
 					},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "NotEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								NotEqual(0x7aabbccdd),
+								NotEqual(math.MaxUint64 - 1),
+								NotEqual(math.MaxUint32),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1},
+					},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (one equal)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0x7aabbccdd, math.MaxUint64, math.MaxUint32 - 1},
+					},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (all equal)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0x7aabbccdd, math.MaxUint64 - 1, math.MaxUint32},
+					},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThan",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 8589934594
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								GreaterThan(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000003}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThan (multi)",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
@@ -355,46 +468,410 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (first arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xabcd000d}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThanOrEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 8589934594
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								GreaterThanOrEqual(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThanOrEqual (multi)",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								GreaterThanOrEqual(0xf),
+								GreaterThanOrEqual(0xabcd000d),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed (both greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg allowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg allowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xabcd000d}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (second arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (both arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "LessThan",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 8589934594
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								LessThan(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "GreaterThan: Syscall argument allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xffffffff}},
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "GreaterThan: Syscall argument disallowed (equal)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf, 0xffffffff}},
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+		},
+		{
+			name: "LessThan (multi)",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								LessThan(0x1),
+								LessThan(0xabcd000d),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0x0}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x1, 0x0}},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Syscall argument disallowed (smaller)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x0, 0xffffffff}},
+					desc: "arg disallowed (first arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0x0}},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "GreaterThan2: Syscall argument allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xfbcd000d}},
+					desc: "arg disallowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xabcd000d}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (both arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "LessThanOrEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 8589934594
+								// Both upper 32 bits and lower 32 bits are non-zero.
+ // 00000000000000000000000000000010 + // 00000000000000000000000000000010 + LessThanOrEqual(0x00000002_00000002), + }, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, + specs: []spec{ + { + desc: "high 32bits greater", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "high 32bits equal, low 32bits greater", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "high 32bits equal, low 32bits equal", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "GreaterThan2: Syscall argument disallowed (equal)", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xabcd000d}}, + desc: "high 32bits equal, low 32bits less", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "high 32bits less", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000002}}, + want: linux.SECCOMP_RET_ALLOW, + }, + }, + }, + + { + name: "LessThanOrEqual (multi)", + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + LessThanOrEqual(0x1), + LessThanOrEqual(0xabcd000d), + }, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, + specs: []spec{ + { + desc: "arg allowed", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0x0}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "arg allowed (first arg equal)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x1, 0x0}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "arg disallowed (first arg greater)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0x0}}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "arg allowed (second arg equal)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xabcd000d}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "arg disallowed (second arg greater)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "arg disallowed (both arg greater)", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0xffffffff}}, + want: linux.SECCOMP_RET_TRAP, + }, + }, + }, + { + name: "MaskedEqual", + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + // x & 00000001 00000011 (0x103) == 00000000 00000001 (0x1) + // Input x must have lowest order bit set and + // must *not* have 8th or second lowest order bit set. 
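Worked out as ordinary integer arithmetic, the mask in that comment behaves exactly as the specs below expect. A quick illustration of MaskedEqual(0x103, 0x1), not part of the change itself:

    fmt.Println(0x001&0x103 == 0x1) // true:  lowest bit set, other masked bits clear
    fmt.Println(0x005&0x103 == 0x1) // true:  bit 2 lies outside the mask
    fmt.Println(0x002&0x103 == 0x1) // false: lowest bit clear, bit 1 set
    fmt.Println(0x100&0x103 == 0x1) // false: bit 8 set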
+ MaskedEqual(0x103, 0x1), + }, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, + specs: []spec{ + { + desc: "arg allowed (low order mandatory bit)", + data: linux.SeccompData{ + Nr: 1, + Arch: LINUX_AUDIT_ARCH, + // 00000000 00000000 00000000 00000001 + Args: [6]uint64{0x1}, + }, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "arg allowed (low order optional bit)", + data: linux.SeccompData{ + Nr: 1, + Arch: LINUX_AUDIT_ARCH, + // 00000000 00000000 00000000 00000101 + Args: [6]uint64{0x5}, + }, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "arg disallowed (lowest order bit not set)", + data: linux.SeccompData{ + Nr: 1, + Arch: LINUX_AUDIT_ARCH, + // 00000000 00000000 00000000 00000010 + Args: [6]uint64{0x2}, + }, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "arg disallowed (second lowest order bit set)", + data: linux.SeccompData{ + Nr: 1, + Arch: LINUX_AUDIT_ARCH, + // 00000000 00000000 00000000 00000011 + Args: [6]uint64{0x3}, + }, want: linux.SECCOMP_RET_TRAP, }, { - desc: "GreaterThan2: Syscall argument disallowed (smaller)", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xa000ffff}}, + desc: "arg disallowed (8th bit set)", + data: linux.SeccompData{ + Nr: 1, + Arch: LINUX_AUDIT_ARCH, + // 00000000 00000000 00000001 00000000 + Args: [6]uint64{0x100}, + }, want: linux.SECCOMP_RET_TRAP, }, }, }, { + name: "Instruction Pointer", ruleSets: []RuleSet{ { Rules: SyscallRules{ 1: []Rule{ { - RuleIP: AllowValue(0x7aabbccdd), + RuleIP: EqualTo(0x7aabbccdd), }, }, }, @@ -402,40 +879,42 @@ func TestBasic(t *testing.T) { }, }, defaultAction: linux.SECCOMP_RET_TRAP, + badArchAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { - desc: "IP: Syscall instruction pointer allowed", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{}, instructionPointer: 0x7aabbccdd}, + desc: "allowed", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{}, InstructionPointer: 0x7aabbccdd}, want: linux.SECCOMP_RET_ALLOW, }, { - desc: "IP: Syscall instruction pointer disallowed", - data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{}, instructionPointer: 0x711223344}, + desc: "disallowed", + data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{}, InstructionPointer: 0x711223344}, want: linux.SECCOMP_RET_TRAP, }, }, }, } { - instrs, err := BuildProgram(test.ruleSets, test.defaultAction) - if err != nil { - t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err) - continue - } - p, err := bpf.Compile(instrs) - if err != nil { - t.Errorf("%s: bpf.Compile() got error: %v", test.specs[0].desc, err) - continue - } - for _, spec := range test.specs { - got, err := bpf.Exec(p, spec.data.asInput()) + t.Run(test.name, func(t *testing.T) { + instrs, err := BuildProgram(test.ruleSets, test.defaultAction, test.badArchAction) if err != nil { - t.Errorf("%s: bpf.Exec() got error: %v", spec.desc, err) - continue + t.Fatalf("BuildProgram() got error: %v", err) } - if got != uint32(spec.want) { - t.Errorf("%s: bpd.Exec() = %d, want: %d", spec.desc, got, spec.want) + p, err := bpf.Compile(instrs) + if err != nil { + t.Fatalf("bpf.Compile() got error: %v", err) } - } + for _, spec := range test.specs { + got, err := bpf.Exec(p, dataAsInput(&spec.data)) + if err != nil { + t.Fatalf("%s: bpf.Exec() got error: %v", spec.desc, err) + } + if got != uint32(spec.want) { + // Include a decoded version of the program in output for 
debugging purposes. + decoded, _ := bpf.DecodeInstructions(instrs) + t.Fatalf("%s: got: %d, want: %d\nBPF Program\n%s", spec.desc, got, spec.want, decoded) + } + } + }) } } @@ -457,7 +936,7 @@ func TestRandom(t *testing.T) { Rules: syscallRules, Action: linux.SECCOMP_RET_ALLOW, }, - }, linux.SECCOMP_RET_TRAP) + }, linux.SECCOMP_RET_TRAP, linux.SECCOMP_RET_KILL_THREAD) if err != nil { t.Fatalf("buildProgram() got error: %v", err) } @@ -466,8 +945,8 @@ func TestRandom(t *testing.T) { t.Fatalf("bpf.Compile() got error: %v", err) } for i := uint32(0); i < 200; i++ { - data := seccompData{nr: i, arch: LINUX_AUDIT_ARCH} - got, err := bpf.Exec(p, data.asInput()) + data := linux.SeccompData{Nr: int32(i), Arch: LINUX_AUDIT_ARCH} + got, err := bpf.Exec(p, dataAsInput(&data)) if err != nil { t.Errorf("bpf.Exec() got error: %v, for syscall %d", err, i) continue diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index fe157f539..7f33e0d9e 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -100,7 +100,7 @@ func main() { if !die { syscalls[syscall.SYS_OPENAT] = []seccomp.Rule{ { - seccomp.AllowValue(10), + seccomp.EqualTo(10), }, } } diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 1a17ad9cb..fbb31dbea 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -407,7 +407,9 @@ func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val Value) Iterator // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // -// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +// Preconditions: +// * r.Start >= gap.Start(). +// * r.End <= gap.End(). func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val Value) Iterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := trackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) @@ -1211,12 +1213,10 @@ func (seg Iterator) End() Key { // does not invalidate any iterators. // // Preconditions: -// -// - r.Length() > 0. -// -// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), -// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then -// r.start >= seg.PrevSegment().End(). +// * r.Length() > 0. +// * The new range must not overlap an existing one: +// * If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). +// * If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg Iterator) SetRangeUnchecked(r Range) { seg.node.keys[seg.index] = r } @@ -1241,8 +1241,9 @@ func (seg Iterator) SetRange(r Range) { // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // -// Preconditions: The new start must be valid: start < seg.End(); if -// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +// Preconditions: The new start must be valid: +// * start < seg.End() +// * If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg Iterator) SetStartUnchecked(start Key) { seg.node.keys[seg.index].Start = start } @@ -1264,8 +1265,9 @@ func (seg Iterator) SetStart(start Key) { // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // -// Preconditions: The new end must be valid: end > seg.Start(); if -// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). 
+// Preconditions: The new end must be valid: +// * end > seg.Start(). +// * If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg Iterator) SetEndUnchecked(end Key) { seg.node.keys[seg.index].End = end } @@ -1695,9 +1697,11 @@ func (s *Set) ExportSortedSlices() *SegmentDataSlices { // ImportSortedSlice initializes the given set from the given slice. // -// Preconditions: s must be empty. sds must represent a valid set (the segments -// in sds must have valid lengths that do not overlap). The segments in sds -// must be sorted in ascending key order. +// Preconditions: +// * s must be empty. +// * sds must represent a valid set (the segments in sds must have valid +// lengths that do not overlap). +// * The segments in sds must be sorted in ascending key order. func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 901e0f320..4af4d6e84 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -22,6 +22,7 @@ go_library( "signal_info.go", "signal_stack.go", "stack.go", + "stack_unsafe.go", "syscalls_amd64.go", "syscalls_arm64.go", ], @@ -33,11 +34,12 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/sentry/limits", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index a903d031c..d75d665ae 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -72,12 +73,12 @@ type Context interface { // with return values of varying sizes (for example ARCH_GETFS). This // is a simple utility function to convert to the native size in these // cases, and then we can CopyOut. - Native(val uintptr) interface{} + Native(val uintptr) marshal.Marshallable // Value converts a native type back to a generic value. // Once a value has been converted to native via the above call -- it // can be converted back here. - Value(val interface{}) uintptr + Value(val marshal.Marshallable) uintptr // Width returns the number of bytes for a native value. Width() uint @@ -205,7 +206,7 @@ type Context interface { // equivalent of arch_ptrace(): // PtracePeekUser implements ptrace(PTRACE_PEEKUSR). - PtracePeekUser(addr uintptr) (interface{}, error) + PtracePeekUser(addr uintptr) (marshal.Marshallable, error) // PtracePokeUser implements ptrace(PTRACE_POKEUSR). 
PtracePokeUser(addr, data uintptr) error diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 0f433ee79..fd73751e7 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -154,6 +154,7 @@ func (s State) Proto() *rpb.Registers { Sp: s.Regs.Sp, Pc: s.Regs.Pc, Pstate: s.Regs.Pstate, + Tls: s.Regs.TPIDR_EL0, } return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}} } @@ -232,6 +233,7 @@ func (s *State) RegisterMap() (map[string]uintptr, error) { "Sp": uintptr(s.Regs.Sp), "Pc": uintptr(s.Regs.Pc), "Pstate": uintptr(s.Regs.Pstate), + "Tls": uintptr(s.Regs.TPIDR_EL0), }, nil } diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 1c3e3c14c..c7d3a206d 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -23,6 +23,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -179,14 +181,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { } // Native returns the native type for the given val. -func (c *context64) Native(val uintptr) interface{} { - v := uint64(val) +func (c *context64) Native(val uintptr) marshal.Marshallable { + v := primitive.Uint64(val) return &v } // Value returns the generic val for the given native type. -func (c *context64) Value(val interface{}) uintptr { - return uintptr(*val.(*uint64)) +func (c *context64) Value(val marshal.Marshallable) uintptr { + return uintptr(*val.(*primitive.Uint64)) } // Width returns the byte width of this architecture. @@ -293,7 +295,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr { const userStructSize = 928 // PtracePeekUser implements Context.PtracePeekUser. -func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { +func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { if addr&7 != 0 || addr >= userStructSize { return nil, syscall.EIO } diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 550741d8c..680d23a9f 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -22,6 +22,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -163,14 +165,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { } // Native returns the native type for the given val. -func (c *context64) Native(val uintptr) interface{} { - v := uint64(val) +func (c *context64) Native(val uintptr) marshal.Marshallable { + v := primitive.Uint64(val) return &v } // Value returns the generic val for the given native type. -func (c *context64) Value(val interface{}) uintptr { - return uintptr(*val.(*uint64)) +func (c *context64) Value(val marshal.Marshallable) uintptr { + return uintptr(*val.(*primitive.Uint64)) } // Width returns the byte width of this architecture. @@ -274,7 +276,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr { } // PtracePeekUser implements Context.PtracePeekUser. -func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { +func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { // TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64. 
return c.Native(0), nil } diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index 60c027aab..2727ba08a 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -83,6 +83,7 @@ message ARM64Registers { uint64 sp = 32; uint64 pc = 33; uint64 pstate = 34; + uint64 tls = 35; } message Registers { oneof arch { diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index 32173aa20..d3e2324a8 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -14,7 +14,7 @@ package arch -import "gvisor.dev/gvisor/tools/go_marshal/marshal" +import "gvisor.dev/gvisor/pkg/marshal" // Special values for SignalAct.Handler. const ( diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 6fb756f0e..72e07a988 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -17,17 +17,19 @@ package arch import ( - "encoding/binary" "math" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). +// +// +marshal type SignalContext64 struct { R8 uint64 R9 uint64 @@ -68,6 +70,8 @@ const ( ) // UContext64 is equivalent to ucontext_t on 64-bit x86. +// +// +marshal type UContext64 struct { Flags uint64 Link uint64 @@ -172,12 +176,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt // "... the value (%rsp+8) is always a multiple of 16 (...) when // control is transferred to the function entry point." - AMD64 ABI - ucSize := binary.Size(uc) - if ucSize < 0 { - // This can only happen if we've screwed up the definition of - // UContext64. - panic("can't get size of UContext64") - } + ucSize := uc.SizeBytes() // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. frameSize := int(st.Arch.Width()) + ucSize + 128 frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8 @@ -195,18 +194,18 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt info.FixSignalCodeForUser() // Set up the stack frame. - infoAddr, err := st.Push(info) - if err != nil { + if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } - ucAddr, err := st.Push(uc) - if err != nil { + infoAddr := st.Bottom + if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } + ucAddr := st.Bottom if act.HasRestorer() { // Push the restorer return address. // Note that this doesn't need to be popped. - if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil { + if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil { return err } } else { @@ -240,11 +239,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { // Copy out the stack frame. 
var uc UContext64 - if _, err := st.Pop(&uc); err != nil { + if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } var info SignalInfo - if _, err := st.Pop(&info); err != nil { + if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 642c79dda..7fde5d34e 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package arch import ( - "encoding/binary" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +26,8 @@ import ( // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). +// +// +marshal type SignalContext64 struct { FaultAddr uint64 Regs [31]uint64 @@ -36,6 +39,7 @@ type SignalContext64 struct { Reserved [3568]uint8 } +// +marshal type aarch64Ctx struct { Magic uint32 Size uint32 @@ -43,6 +47,8 @@ type aarch64Ctx struct { // FpsimdContext is equivalent to struct fpsimd_context on arm64 // (arch/arm64/include/uapi/asm/sigcontext.h). +// +// +marshal type FpsimdContext struct { Head aarch64Ctx Fpsr uint32 @@ -51,13 +57,15 @@ type FpsimdContext struct { } // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h). +// +// +marshal type UContext64 struct { Flags uint64 Link uint64 Stack SignalStack Sigset linux.SignalSet // glibc uses a 1024-bit sigset_t - _pad [(1024 - 64) / 8]byte + _pad [120]byte // (1024 - 64) / 8 = 120 // sigcontext must be aligned to 16-byte _pad2 [8]byte // last for future expansion @@ -94,11 +102,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt }, Sigset: sigset, } - - ucSize := binary.Size(uc) - if ucSize < 0 { - panic("can't get size of UContext64") - } + ucSize := uc.SizeBytes() // frameSize = ucSize + sizeof(siginfo). // sizeof(siginfo) == 128. @@ -119,14 +123,14 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt info.FixSignalCodeForUser() // Set up the stack frame. - infoAddr, err := st.Push(info) - if err != nil { + if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } - ucAddr, err := st.Push(uc) - if err != nil { + infoAddr := st.Bottom + if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } + ucAddr := st.Bottom // Set up registers. c.Regs.Sp = uint64(st.Bottom) @@ -147,11 +151,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { // Copy out the stack frame. 
var uc UContext64 - if _, err := st.Pop(&uc); err != nil { + if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } var info SignalInfo - if _, err := st.Pop(&info); err != nil { + if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index 0fa738a1d..a1eae98f9 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -17,8 +17,8 @@ package arch import ( + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) const ( diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 1108fa0bd..5f06c751d 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -15,14 +15,16 @@ package arch import ( - "encoding/binary" - "fmt" - "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" ) -// Stack is a simple wrapper around a usermem.IO and an address. +// Stack is a simple wrapper around a usermem.IO and an address. Stack +// implements marshal.CopyContext, and marshallable values can be pushed or +// popped from the stack through the marshal.Marshallable interface. +// +// Stack is not thread-safe. type Stack struct { // Our arch info. // We use this for automatic Native conversion of usermem.Addrs during @@ -34,105 +36,60 @@ type Stack struct { // Our current stack bottom. Bottom usermem.Addr -} -// Push pushes the given values on to the stack. -// -// (This method supports Addrs and treats them as native types.) -func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { - for _, v := range vals { - - // We convert some types to well-known serializable quanities. - var norm interface{} - - // For array types, we will automatically add an appropriate - // terminal value. This is done simply to make the interface - // easier to use. - var term interface{} - - switch v.(type) { - case string: - norm = []byte(v.(string)) - term = byte(0) - case []int8, []uint8: - norm = v - term = byte(0) - case []int16, []uint16: - norm = v - term = uint16(0) - case []int32, []uint32: - norm = v - term = uint32(0) - case []int64, []uint64: - norm = v - term = uint64(0) - case []usermem.Addr: - // Special case: simply push recursively. - _, err := s.Push(s.Arch.Native(uintptr(0))) - if err != nil { - return 0, err - } - varr := v.([]usermem.Addr) - for i := len(varr) - 1; i >= 0; i-- { - _, err := s.Push(varr[i]) - if err != nil { - return 0, err - } - } - continue - case usermem.Addr: - norm = s.Arch.Native(uintptr(v.(usermem.Addr))) - default: - norm = v - } + // Scratch buffer used for marshalling to avoid having to repeatedly + // allocate scratch memory. + scratchBuf []byte +} - if term != nil { - _, err := s.Push(term) - if err != nil { - return 0, err - } - } +// scratchBufLen is the default length of Stack.scratchBuf. The +// largest structs the stack regularly serializes are arch.SignalInfo +// and arch.UContext64. We'll set the default size as the larger of +// the two, arch.UContext64. +var scratchBufLen = (*UContext64)(nil).SizeBytes() - c := binary.Size(norm) - if c < 0 { - return 0, fmt.Errorf("bad binary.Size for %T", v) - } - n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) - if err != nil || c != n { - return 0, err - } +// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. 
+func (s *Stack) CopyScratchBuffer(size int) []byte { + if len(s.scratchBuf) < size { + s.scratchBuf = make([]byte, size) + } + return s.scratchBuf[:size] +} +// StackBottomMagic is the special address callers must pass to all stack +// marshalling operations to cause the src/dst address to be computed based on +// the current end of the stack. +const StackBottomMagic = ^usermem.Addr(0) // usermem.Addr(-1) + +// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes +// computes an appropriate address based on the current end of the +// stack. Callers pass the sentinel address StackBottomMagic to marshalling +// methods to indicate this. +func (s *Stack) CopyOutBytes(sentinel usermem.Addr, b []byte) (int, error) { + if sentinel != StackBottomMagic { + panic("Attempted to copy out to stack with absolute address") + } + c := len(b) + n, err := s.IO.CopyOut(context.Background(), s.Bottom-usermem.Addr(c), b, usermem.IOOpts{}) + if err == nil && n == c { s.Bottom -= usermem.Addr(n) } - - return s.Bottom, nil + return n, err } -// Pop pops the given values off the stack. -// -// (This method supports Addrs and treats them as native types.) -func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { - for _, v := range vals { - - vaddr, isVaddr := v.(*usermem.Addr) - - var n int - var err error - if isVaddr { - value := s.Arch.Native(uintptr(0)) - n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) - *vaddr = usermem.Addr(s.Arch.Value(value)) - } else { - n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) - } - if err != nil { - return 0, err - } - +// CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes +// an appropriate address based on the current end of the stack. Callers must +// pass the sentinel address StackBottomMagic to marshalling methods to +// indicate this. +func (s *Stack) CopyInBytes(sentinel usermem.Addr, b []byte) (int, error) { + if sentinel != StackBottomMagic { + panic("Attempted to copy in from stack with absolute address") + } + n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{}) + if err == nil { s.Bottom += usermem.Addr(n) } - - return s.Bottom, nil + return n, err } // Align aligns the stack to the given offset. @@ -142,6 +99,22 @@ func (s *Stack) Align(offset int) { } } +// PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null +// byte at the end. On error, the contents of the stack and the bottom cursor +// are undefined. +func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) { + // Note: Stack grows up, so write the terminal null byte first. + nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0) + if err != nil { + return 0, err + } + n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs) + if err != nil { + return 0, err + } + return n + nNull, nil +} + // StackLayout describes the location of the arguments and environment on the // stack.
type StackLayout struct { @@ -177,11 +150,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) l.EnvvEnd = s.Bottom envAddrs := make([]usermem.Addr, len(env)) for i := len(env) - 1; i >= 0; i-- { - addr, err := s.Push(env[i]) - if err != nil { + if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil { return StackLayout{}, err } - envAddrs[i] = addr + envAddrs[i] = s.Bottom } l.EnvvStart = s.Bottom @@ -189,11 +161,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) l.ArgvEnd = s.Bottom argAddrs := make([]usermem.Addr, len(args)) for i := len(args) - 1; i >= 0; i-- { - addr, err := s.Push(args[i]) - if err != nil { + if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil { return StackLayout{}, err } - argAddrs[i] = addr + argAddrs[i] = s.Bottom } l.ArgvStart = s.Bottom @@ -222,26 +193,26 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) auxv = append(auxv, usermem.Addr(a.Key), a.Value) } auxv = append(auxv, usermem.Addr(0)) - _, err := s.Push(auxv) + _, err := s.pushAddrSliceAndTerminator(auxv) if err != nil { return StackLayout{}, err } // Push environment. - _, err = s.Push(envAddrs) + _, err = s.pushAddrSliceAndTerminator(envAddrs) if err != nil { return StackLayout{}, err } // Push args. - _, err = s.Push(argAddrs) + _, err = s.pushAddrSliceAndTerminator(argAddrs) if err != nil { return StackLayout{}, err } // Push arg count. - _, err = s.Push(usermem.Addr(len(args))) - if err != nil { + lenP := s.Arch.Native(uintptr(len(args))) + if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil { return StackLayout{}, err } diff --git a/pkg/sentry/arch/stack_unsafe.go b/pkg/sentry/arch/stack_unsafe.go new file mode 100644 index 000000000..a90d297ee --- /dev/null +++ b/pkg/sentry/arch/stack_unsafe.go @@ -0,0 +1,69 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "reflect" + "runtime" + "unsafe" + + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/usermem" +) + +// pushAddrSliceAndTerminator copies a slice of addresses to the stack, and +// also pushes an extra null address element at the end of the slice. +// +// Internally, we unsafely transmute the slice type from the arch-dependent +// []usermem.Addr type to a slice of fixed-size integers so that we can pass it +// to go-marshal. +// +// On error, the contents of the stack and the bottom cursor are undefined. +func (s *Stack) pushAddrSliceAndTerminator(src []usermem.Addr) (int, error) { + // Note: Stack grows upwards, so push the terminator first.
+ srcHdr := (*reflect.SliceHeader)(unsafe.Pointer(&src)) + switch s.Arch.Width() { + case 8: + nNull, err := primitive.CopyUint64Out(s, StackBottomMagic, 0) + if err != nil { + return 0, err + } + var dst []uint64 + dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) + dstHdr.Data = srcHdr.Data + dstHdr.Len = srcHdr.Len + dstHdr.Cap = srcHdr.Cap + n, err := primitive.CopyUint64SliceOut(s, StackBottomMagic, dst) + // Ensures src doesn't get GCed until we're done using it through dst. + runtime.KeepAlive(src) + return n + nNull, err + case 4: + nNull, err := primitive.CopyUint32Out(s, StackBottomMagic, 0) + if err != nil { + return 0, err + } + var dst []uint32 + dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) + dstHdr.Data = srcHdr.Data + dstHdr.Len = srcHdr.Len + dstHdr.Cap = srcHdr.Cap + n, err := primitive.CopyUint32SliceOut(s, StackBottomMagic, dst) + // Ensure src doesn't get GCed until we're done using it through dst. + runtime.KeepAlive(src) + return n + nNull, err + default: + panic("Unsupported arch width") + } +} diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go index 8e5658c7a..dfd195a23 100644 --- a/pkg/sentry/contexttest/contexttest.go +++ b/pkg/sentry/contexttest/contexttest.go @@ -144,27 +144,7 @@ func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { // RootContext returns a Context that may be used in tests that need root // credentials. Uses ptrace as the platform.Platform. func RootContext(tb testing.TB) context.Context { - return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) -} - -// WithCreds returns a copy of ctx carrying creds. -func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { - return &authContext{ctx, creds} -} - -type authContext struct { - context.Context - creds *auth.Credentials -} - -// Value implements context.Context. -func (ac *authContext) Value(key interface{}) interface{} { - switch key { - case auth.CtxCredentials: - return ac.creds - default: - return ac.Context.Value(key) - } + return auth.ContextWithCredentials(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) } // WithLimitSet returns a copy of ctx carrying l. diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 2c5d14be5..deaf5fa23 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -35,7 +35,6 @@ go_library( "//pkg/sync", "//pkg/tcpip/link/sniffer", "//pkg/urpc", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index dfa936563..1d88db12f 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -23,8 +23,8 @@ import ( "text/tabwriter" "time" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" @@ -183,9 +183,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI if initArgs.MountNamespaceVFS2 == nil { // Set initArgs so that 'ctx' returns the namespace. // - // MountNamespaceVFS2 adds a reference to the namespace, which is - // transferred to the new process. + // Add a reference to the namespace, which is transferred to the new process. 
initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() + initArgs.MountNamespaceVFS2.IncRef() } } else { if initArgs.MountNamespace == nil { @@ -203,27 +203,17 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } initArgs.Filename = resolved - fds := make([]int, len(args.FilePayload.Files)) - for i, file := range args.FilePayload.Files { - if kernel.VFS2Enabled { - // Need to dup to remove ownership from os.File. - dup, err := unix.Dup(int(file.Fd())) - if err != nil { - return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) - } - fds[i] = dup - } else { - // VFS1 dups the file on import. - fds[i] = int(file.Fd()) - } + fds, err := fd.NewFromFiles(args.Files) + if err != nil { + return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) } + defer func() { + for _, fd := range fds { + _ = fd.Close() + } + }() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) if err != nil { - if kernel.VFS2Enabled { - for _, fd := range fds { - unix.Close(fd) - } - } return nil, 0, nil, nil, err } diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD index abe58f818..4c8604d58 100644 --- a/pkg/sentry/devices/memdev/BUILD +++ b/pkg/sentry/devices/memdev/BUILD @@ -18,9 +18,10 @@ go_library( "//pkg/rand", "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/mm", - "//pkg/sentry/pgalloc", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index 511179e31..fece3e762 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -24,6 +24,8 @@ import ( const fullDevMinor = 7 // fullDevice implements vfs.Device for /dev/full. +// +// +stateify savable type fullDevice struct{} // Open implements vfs.Device.Open. @@ -38,6 +40,8 @@ func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // fullFD implements vfs.FileDescriptionImpl for /dev/full. +// +// +stateify savable type fullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 4918dbeeb..ff5837747 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -25,6 +25,8 @@ import ( const nullDevMinor = 3 // nullDevice implements vfs.Device for /dev/null. +// +// +stateify savable type nullDevice struct{} // Open implements vfs.Device.Open. @@ -39,6 +41,8 @@ func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // nullFD implements vfs.FileDescriptionImpl for /dev/null. +// +// +stateify savable type nullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index 5e7fe0280..ac943e3ba 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -30,6 +30,8 @@ const ( ) // randomDevice implements vfs.Device for /dev/random and /dev/urandom. +// +// +stateify savable type randomDevice struct{} // Open implements vfs.Device.Open. @@ -44,6 +46,8 @@ func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, } // randomFD implements vfs.FileDescriptionImpl for /dev/random. 
+// +// +stateify savable type randomFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index 2e631a252..1929e41cd 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -16,9 +16,10 @@ package memdev import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -26,6 +27,8 @@ import ( const zeroDevMinor = 5 // zeroDevice implements vfs.Device for /dev/zero. +// +// +stateify savable type zeroDevice struct{} // Open implements vfs.Device.Open. @@ -40,6 +43,8 @@ func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // zeroFD implements vfs.FileDescriptionImpl for /dev/zero. +// +// +stateify savable type zeroFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -79,11 +84,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if opts.Private || !opts.MaxPerms.Write { + // This mapping will never permit writing to the "underlying file" (in + // Linux terms, it isn't VM_SHARED), so implement it as an anonymous + // mapping, but back it with fd; this is what Linux does, and is + // actually application-visible because the resulting VMA will show up + // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than + // "/dev/zero (deleted)". + opts.Offset = 0 + opts.MappingIdentity = &fd.vfsfd + opts.MappingIdentity.IncRef() + return nil + } + tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length) if err != nil { return err } - opts.MappingIdentity = m - opts.Mappable = m - return nil + defer tmpfsFD.DecRef(ctx) + return tmpfsFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go index 664e54498..a287c65ca 100644 --- a/pkg/sentry/devices/ttydev/ttydev.go +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -30,6 +30,8 @@ const ( ) // ttyDevice implements vfs.Device for /dev/tty. +// +// +stateify savable type ttyDevice struct{} // Open implements vfs.Device.Open. 
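The /dev/zero ConfigureMMap change above has an application-visible consequence that is easy to demonstrate from userspace. The following standalone Go sketch (not part of this change; it uses only the standard syscall package, and the page size and writes are arbitrary) exercises both paths that the new code distinguishes:

```go
package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	f, err := os.Open("/dev/zero")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// MAP_PRIVATE: this mapping never writes back to the "underlying
	// file" (it isn't VM_SHARED in Linux terms), so the sentry now
	// implements it as an anonymous mapping backed by the /dev/zero FD;
	// the VMA is named with /dev/zero's path in /proc/[pid]/maps rather
	// than "/dev/zero (deleted)".
	priv, err := syscall.Mmap(int(f.Fd()), 0, 4096,
		syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE)
	if err != nil {
		panic(err)
	}
	defer syscall.Munmap(priv)
	priv[0] = 1 // copy-on-write; never visible through any file

	// MAP_SHARED with write permission: behaves like shared anonymous
	// memory (e.g. it stays shared across fork), which the new code
	// backs with a tmpfs zero file via tmpfs.NewZeroFile.
	shared, err := syscall.Mmap(int(f.Fd()), 0, 4096,
		syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
	if err != nil {
		panic(err)
	}
	defer syscall.Munmap(shared)
	shared[0] = 1

	fmt.Println("see /proc/self/maps for how each mapping is named")
}
```

Only the second mapping takes the tmpfs.NewZeroFile path; the first returns early with fd.vfsfd as the mapping identity, as the diff above shows.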
diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD index 71c59287c..14a8bf9cd 100644 --- a/pkg/sentry/devices/tundev/BUILD +++ b/pkg/sentry/devices/tundev/BUILD @@ -17,6 +17,7 @@ go_library( "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip/link/tun", + "//pkg/tcpip/network/arp", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index a40625e19..655ea549b 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -16,6 +16,8 @@ package tundev import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -26,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -64,12 +67,13 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg request := args[1].Uint() data := args[2].Pointer() + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + switch request { case linux.TUNSETIFF: - t := kernel.TaskFromContext(ctx) - if t == nil { - panic("Ioctl should be called from a task context") - } if !t.HasCapability(linux.CAP_NET_ADMIN) { return 0, syserror.EPERM } @@ -79,13 +83,20 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg } var req linux.IFReq - if _, err := usermem.CopyObjectIn(ctx, uio, data, &req, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := req.CopyIn(t, data); err != nil { return 0, err } flags := usermem.ByteOrder.Uint16(req.Data[:]) - return 0, fd.device.SetIff(stack.Stack, req.Name(), flags) + created, err := fd.device.SetIff(stack.Stack, req.Name(), flags) + if err == nil && created { + // Always start with an ARP address for interfaces so they can handle ARP + // packets. + nicID := fd.device.NICID() + if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID)) + } + } + return 0, err case linux.TUNGETIFF: var req linux.IFReq @@ -97,9 +108,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg flags := fd.device.Flags() | linux.IFF_NOFILTER usermem.ByteOrder.PutUint16(req.Data[:], flags) - _, err := usermem.CopyObjectOut(ctx, uio, data, &req, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := req.CopyOut(t, data) return 0, err default: diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD index 5e41ceb4e..6b4f8b0ed 100644 --- a/pkg/sentry/fdimport/BUILD +++ b/pkg/sentry/fdimport/BUILD @@ -10,6 +10,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/fd", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/fsimpl/host", diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index 1b7cb94c0..314661475 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" @@ -27,8 +28,9 @@ import ( // Import imports a slice of FDs into the given FDTable. 
If console is true, // sets up TTY for the first 3 FDs in the slice representing stdin, stdout, -// stderr. Upon success, Import takes ownership of all FDs. -func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +// stderr. Used FDs are either closed or released. It's safe for the caller to +// close any remaining files upon return. +func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if kernel.VFS2Enabled { ttyFile, err := importVFS2(ctx, fdTable, console, fds) return nil, ttyFile, err @@ -37,7 +39,7 @@ func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []in return ttyFile, nil, err } -func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) { +func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, error) { var ttyFile *fs.File for appFD, hostFD := range fds { var appFile *fs.File @@ -46,11 +48,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD.FD(), true /* isTTY */) if err != nil { return nil, err } defer appFile.DecRef(ctx) + _ = hostFD.Close() // FD is dup'd in ImportFile. // Remember this in the TTY file, as we will // use it for the other stdio FDs. @@ -65,11 +68,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] } else { // Import the file as a regular host file. var err error - appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD.FD(), false /* isTTY */) if err != nil { return nil, err } defer appFile.DecRef(ctx) + _ = hostFD.Close() // FD is dup'd in ImportFile. } // Add the file to the FD map. @@ -84,7 +88,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] return ttyFile.FileOperations.(*host.TTYFileOperations), nil } -func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) { +func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, fmt.Errorf("cannot find kernel from context") @@ -98,11 +102,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */) + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */) if err != nil { return nil, err } defer appFile.DecRef(ctx) + hostFD.Release() // FD is transferred to host FD. // Remember this in the TTY file, as we will use it for the other stdio // FDs. @@ -115,11 +120,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi } } else { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */) + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */) if err != nil { return nil, err } defer appFile.DecRef(ctx) + hostFD.Release() // FD is transferred to host FD.
} if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 735452b07..ff2fe6712 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -107,8 +107,7 @@ func copyUp(ctx context.Context, d *Dirent) error { // leave the upper filesystem filled with any number of parent directories // but the upper filesystem will never be in an inconsistent state. // -// Preconditions: -// - d.Inode.overlay is non-nil. +// Preconditions: d.Inode.overlay is non-nil. func copyUpLockedForRename(ctx context.Context, d *Dirent) error { for { // Did we race with another copy up or does there @@ -183,12 +182,12 @@ func doCopyUp(ctx context.Context, d *Dirent) error { // Returns a generic error on failure. // // Preconditions: -// - parent.Inode.overlay.upper must be non-nil. -// - next.Inode.overlay.copyMu must be locked writable. -// - next.Inode.overlay.lower must be non-nil. -// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory, +// * parent.Inode.overlay.upper must be non-nil. +// * next.Inode.overlay.copyMu must be locked writable. +// * next.Inode.overlay.lower must be non-nil. +// * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory, // or Symlink. -// - upper filesystem must support setting file ownership and timestamps. +// * upper filesystem must support setting file ownership and timestamps. func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { // Extract the attributes of the file we wish to copy. attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 9379a4d7b..6b7b451b8 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/syserror", "//pkg/tcpip/link/tun", + "//pkg/tcpip/network/arp", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index ec474e554..19ffdec47 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -15,6 +15,8 @@ package dev import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -25,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -60,7 +63,7 @@ func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMod } // GetFile implements fs.InodeOperations.GetFile. -func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { +func (*netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil } @@ -80,21 +83,22 @@ type netTunFileOperations struct { var _ fs.FileOperations = (*netTunFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (fops *netTunFileOperations) Release(ctx context.Context) { - fops.device.Release(ctx) +func (n *netTunFileOperations) Release(ctx context.Context) { + n.device.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. 
-func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { request := args[1].Uint() data := args[2].Pointer() + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + switch request { case linux.TUNSETIFF: - t := kernel.TaskFromContext(ctx) - if t == nil { - panic("Ioctl should be called from a task context") - } if !t.HasCapability(linux.CAP_NET_ADMIN) { return 0, syserror.EPERM } @@ -104,27 +108,32 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u } var req linux.IFReq - if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := req.CopyIn(t, data); err != nil { return 0, err } flags := usermem.ByteOrder.Uint16(req.Data[:]) - return 0, fops.device.SetIff(stack.Stack, req.Name(), flags) + created, err := n.device.SetIff(stack.Stack, req.Name(), flags) + if err == nil && created { + // Always start with an ARP address for interfaces so they can handle ARP + // packets. + nicID := n.device.NICID() + if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID)) + } + } + return 0, err case linux.TUNGETIFF: var req linux.IFReq - copy(req.IFName[:], fops.device.Name()) + copy(req.IFName[:], n.device.Name()) // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c. - flags := fops.device.Flags() | linux.IFF_NOFILTER + flags := n.device.Flags() | linux.IFF_NOFILTER usermem.ByteOrder.PutUint16(req.Data[:], flags) - _, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := req.CopyOut(t, data) return 0, err default: @@ -133,41 +142,41 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u } // Write implements fs.FileOperations.Write. -func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { +func (n *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { data := make([]byte, src.NumBytes()) if _, err := src.CopyIn(ctx, data); err != nil { return 0, err } - return fops.device.Write(data) + return n.device.Write(data) } // Read implements fs.FileOperations.Read. -func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - data, err := fops.device.Read() +func (n *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + data, err := n.device.Read() if err != nil { return 0, err } - n, err := dst.CopyOut(ctx, data) - if n > 0 && n < len(data) { + bytesCopied, err := dst.CopyOut(ctx, data) + if bytesCopied > 0 && bytesCopied < len(data) { // Not an error for partial copying. Packet truncated. err = nil } - return int64(n), err + return int64(bytesCopied), err } // Readiness implements watier.Waitable.Readiness. 
-func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { - return fops.device.Readiness(mask) +func (n *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return n.device.Readiness(mask) } // EventRegister implements watier.Waitable.EventRegister. -func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - fops.device.EventRegister(e, mask) +func (n *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + n.device.EventRegister(e, mask) } // EventUnregister implements watier.Waitable.EventUnregister. -func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) { - fops.device.EventUnregister(e) +func (n *netTunFileOperations) EventUnregister(e *waiter.Entry) { + n.device.EventUnregister(e) } // isNetTunSupported returns whether /dev/net/tun device is supported for s. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index a2f751068..00c526b03 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -413,9 +413,9 @@ func (d *Dirent) descendantOf(p *Dirent) bool { // Inode.Lookup, otherwise walk will keep d.mu locked. // // Preconditions: -// - renameMu must be held for reading. -// - d.mu must be held. -// - name must must not contain "/"s. +// * renameMu must be held for reading. +// * d.mu must be held. +// * name must not contain "/"s. func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) { if !IsDir(d.Inode.StableAttr) { return nil, syscall.ENOTDIR @@ -577,9 +577,9 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, // exists returns true if name exists in relation to d. // // Preconditions: -// - renameMu must be held for reading. -// - d.mu must be held. -// - name must must not contain "/"s. +// * renameMu must be held for reading. +// * d.mu must be held. +// * name must not contain "/"s. func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { child, err := d.walk(ctx, root, name, false /* may unlock */) if err != nil { diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 305c0f840..6ec721022 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -159,8 +159,9 @@ type FileOperations interface { // io provides access to the virtual memory space to which pointers in args // refer. // - // Preconditions: The AddressSpace (if any) that io refers to is activated. - // Must only be called from a task goroutine. + // Preconditions: + // * The AddressSpace (if any) that io refers to is activated. + // * Must only be called from a task goroutine. Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) } diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index bbafebf03..1dc409d38 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -70,7 +70,9 @@ func (seg FileRangeIterator) FileRange() memmap.FileRange { // FileRangeOf returns the FileRange mapped by mr. // -// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. +// Preconditions: +// * seg.Range().IsSupersetOf(mr). +// * mr.Length() != 0.
func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange { frstart := seg.Value() + (mr.Start - seg.Start()) return memmap.FileRange{frstart, frstart + mr.Length()} @@ -82,15 +84,18 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRan // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of // mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are -// invalid. +// invalid. fileSize is an upper bound on the file's size; bytes after fileSize +// will be zeroed without calling readAt. // // Fill may read offsets outside of required, but will never read offsets // outside of optional. It returns a non-nil error if any error occurs, even // if the error only affects offsets in optional, but not in required. // -// Preconditions: required.Length() > 0. optional.IsSupersetOf(required). -// required and optional must be page-aligned. -func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { +// Preconditions: +// * required.Length() > 0. +// * optional.IsSupersetOf(required). +// * required and optional must be page-aligned. +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { gap := frs.LowerBoundGap(required.Start) for gap.Ok() && gap.Start() < required.End { if gap.Range().Length() == 0 { @@ -103,7 +108,21 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { - n, err := readAt(ctx, dsts, gr.Start+done) + n, err := func() (uint64, error) { + off := gr.Start + done + if off >= fileSize { + return 0, io.EOF + } + if off+dsts.NumBytes() > fileSize { + rd := fileSize - off + n, err := readAt(ctx, dsts.TakeFirst64(rd), off) + if n == rd && err == nil { + return n, io.EOF + } + return n, err + } + return readAt(ctx, dsts, off) + }() done += n dsts = dsts.DropFirst64(n) if err != nil { diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index ef0113b52..1390a9a7f 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -80,7 +80,9 @@ func NewHostFileMapper() *HostFileMapper { // IncRefOn increments the reference count on all offsets in mr. // -// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +// Preconditions: +// * mr.Length() != 0. +// * mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() @@ -97,7 +99,9 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { // DecRefOn decrements the reference count on all offsets in mr. // -// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +// Preconditions: +// * mr.Length() != 0. +// * mr.Start and mr.End must be page-aligned. 
func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() @@ -204,7 +208,9 @@ func (f *HostFileMapper) UnmapAll() { } } -// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m. +// Preconditions: +// * f.mapsMu must be locked. +// * f.mappings[chunkStart] == m. func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { // This leaks address space and is unexpected, but is otherwise diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index fe8b0b6ac..82eda3e43 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -444,7 +443,7 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs. // time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) { c.attr.AccessTime = now c.dirtyAttr.AccessTime = true } @@ -461,7 +460,7 @@ func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx contex // and status change times in-place to the current time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) { c.attr.ModificationTime = now c.dirtyAttr.ModificationTime = true c.attr.StatusChangeTime = now @@ -480,7 +479,7 @@ func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) { // in-place to the current time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) { c.attr.StatusChangeTime = now c.dirtyAttr.StatusChangeTime = true } @@ -645,7 +644,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { End: fs.OffsetPageEnd(int64(gapMR.End)), } optMR := gap.Range() - err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt) + err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt) mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.c.cache.Find(uint64(rw.offset)) if !seg.Ok() { @@ -672,9 +671,6 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { // Continue. seg, gap = gap.NextSegment(), FileRangeGapIterator{} } - - default: - break } } unlock() @@ -684,7 +680,9 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { // maybeGrowFile grows the file's size if data has been written past the old // size. // -// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked. +// Preconditions: +// * rw.c.attrMu must be locked. +// * rw.c.dataMu must be locked. 
func (rw *inodeReadWriter) maybeGrowFile() { // If the write ends beyond the file's previous size, it causes the // file to grow. @@ -766,9 +764,6 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error // Continue. seg, gap = gap.NextSegment(), FileRangeGapIterator{} - - default: - break } } rw.maybeGrowFile() @@ -875,7 +870,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option } mf := c.mfp.MemoryFile() - cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md index 2ca84dd74..05e043583 100644 --- a/pkg/sentry/fs/g3doc/fuse.md +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -79,7 +79,7 @@ ops can be implemented in parallel. - Implement `/dev/fuse` - a character device used to establish an FD for communication between the sentry and the server daemon. -- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. +- Implement basic FUSE ops like `FUSE_INIT`. #### Read-only mount with basic file operations @@ -95,6 +95,103 @@ ops can be implemented in parallel. - Implement the remaining FUSE ops and decide if we can omit rarely used operations like ioctl. +### Design Details + +#### Lifecycle for a FUSE Request + +- User invokes a syscall +- Sentry prepares the corresponding request + - If FUSE device is available + - Write the request in binary + - If FUSE device is full + - Kernel task is blocked until space is available +- Sentry notifies the readers of the fuse device that it's ready for reading +- FUSE daemon reads the request and processes it +- Sentry waits until a reply is written to the FUSE device + - but returns directly for async requests +- FUSE daemon writes to the fuse device +- Sentry processes the reply + - For sync requests, unblock the blocked kernel task + - For async requests, execute the pre-specified callback if any +- Sentry returns the syscall result to the user + +#### Channels and Queues for Requests in Different Stages + +`connection.initializedChan` + +- a channel that requests issued before connection initialization block + on. + +`fd.queue` + +- a queue of requests that haven’t been read by the FUSE daemon yet. + +`fd.completions` + +- a map of the requests that have been prepared but not yet received a + response, including the ones on the `fd.queue`. + +`fd.waitQueue` + +- a queue of waiters, such as the FUSE daemon, that are waiting for the fuse + device fd to become available. + +`fd.fullQueueCh` + +- a channel that the kernel task will be blocked on when the fd is not + available. + +#### Basic I/O Implementation + +We have currently implemented basic read and write functionality for our FUSE +filesystem. We describe the design and ways to improve it here: + +##### Basic FUSE Read + +vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and +`vfs.FileDescriptionImpl.PRead()`. When a syscall is made, it will eventually +reach our implementation of those interface functions located at +`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files. + +After validating the input, the sentry sends `FUSE_READ` requests to the +FUSE daemon. The FUSE daemon returns the data after the `fuse_out_header` as its +response. For the first version, we create a copy in kernel memory of the incoming +data.
This data is represented as a byte slice in the marshalled struct. This +currently happens as a common step for all FUSE responses, at +`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then copy directly from this +intermediate buffer to the input buffer provided by the read syscall. + +There is an extra requirement for FUSE: when mounting the FUSE fs, the mounter +or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. They are +the upper bound on the number of bytes to read in each `FUSE_READ` request. We +implemented code to handle such fragmented reads. + +To improve performance, we should ideally have a buffer cache to copy the data +from the FUSE daemon's responses into, instead of a single-use temporary +buffer; this is also the design of several other existing sentry file system +implementations. Directly mapping the memory of one process into another could +also boost performance, but we chose not to do so in order to keep the +processes isolated. + +##### Basic FUSE Write + +vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and +`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a +user makes write(2) and pwrite(2) syscalls. + +For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header +(which can be regarded as a request with two payloads) to the FUSE daemon. For +the first version, we allocate a buffer inside kernel memory to store the bytes +from the user, and copy directly from that buffer to the memory of the FUSE +daemon. This happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`. + +The parameters `max_write` and `max_pages` restrict the number of bytes in one +`FUSE_WRITE`. There is code handling fragmented writes in the current +implementation. + +For better performance, the extra copy created to store the bytes to write +could likewise be replaced by the buffer cache. + # Appendix ## FUSE Protocol diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index d41d23a43..1368014c4 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/fdnotifier", "//pkg/iovec", "//pkg/log", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/secio", diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index 5d4f312cf..c8231e0aa 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, err + return length, n, msg.Controllen, controlTrunc, nil } - return n, n, msg.Controllen, controlTrunc, err + return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd.
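The fuse.md notes above describe splitting large reads and writes into multiple `FUSE_READ`/`FUSE_WRITE` requests according to the mount's `max_read`/`max_pages` limits. As a rough illustration of that splitting logic, here is a minimal sketch; the helper name, types, and values are hypothetical and this is not the actual `fsimpl/fuse` code:

```go
package main

import "fmt"

// splitRead mirrors the fragmented-read handling described in fuse.md:
// a read of `size` bytes at offset `off` is issued as a sequence of
// FUSE_READ requests of at most maxRead bytes each, where maxRead is
// derived from the mount's max_read/max_pages parameters.
func splitRead(off, size, maxRead uint64) [][2]uint64 {
	var frags [][2]uint64 // {offset, length} pairs, one per FUSE_READ
	for size > 0 {
		n := size
		if n > maxRead {
			n = maxRead
		}
		frags = append(frags, [2]uint64{off, n})
		off += n
		size -= n
	}
	return frags
}

func main() {
	// A 300 KiB read with max_read = 128 KiB becomes three FUSE_READ
	// requests: 128 KiB + 128 KiB + 44 KiB.
	for _, f := range splitRead(0, 300<<10, 128<<10) {
		fmt.Printf("FUSE_READ off=%d len=%d\n", f[0], f[1])
	}
}
```

The same bounded-chunk loop applies to `FUSE_WRITE` with `max_write` in place of `max_read`.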
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index e29ae00f2..1183727ab 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -17,6 +17,7 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -53,7 +54,7 @@ type TTYFileOperations struct { func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{ fileOperations: fileOperations{iops: iops}, - termios: linux.DefaultSlaveTermios, + termios: linux.DefaultReplicaTermios, }) } @@ -123,6 +124,11 @@ func (t *TTYFileOperations) Release(ctx context.Context) { // Ioctl implements fs.FileOperations.Ioctl. func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. This is the real FD: fd := t.fileOperations.iops.fileState.FD() ioctl := args[1].Uint64() @@ -132,9 +138,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = termios.CopyOut(task, args[2].Pointer()) return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: @@ -146,9 +150,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO } var termios linux.Termios - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetTermios(fd, ioctl, &termios) @@ -173,10 +175,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // Map the ProcessGroup into a ProcessGroupID in the task's PID // namespace. - pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }) + pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup)) + _, err := pgID.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: @@ -184,11 +184,6 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. - task := kernel.TaskFromContext(ctx) - if task == nil { - return 0, syserror.ENOTTY - } - t.mu.Lock() defer t.mu.Unlock() @@ -208,12 +203,11 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO return 0, syserror.ENOTTY } - var pgID kernel.ProcessGroupID - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgIDP primitive.Int32 + if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } + pgID := kernel.ProcessGroupID(pgIDP) // pgID must be non-negative. 
if pgID < 0 { @@ -242,9 +236,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = winsize.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSWINSZ: @@ -255,9 +247,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // background ones) can set the winsize. var winsize linux.Winsize - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetWinsize(fd, &winsize) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index b79cd9877..004910453 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -270,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string, // SetXattr calls i.InodeOperations.SetXattr with i as the Inode. func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error { if i.overlay != nil { - return overlaySetxattr(ctx, i.overlay, d, name, value, flags) + return overlaySetXattr(ctx, i.overlay, d, name, value, flags) } return i.InodeOperations.SetXattr(ctx, i, name, value, flags) } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index dc2e353d9..b16ab08ba 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -16,7 +16,6 @@ package fs import ( "fmt" - "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin // Don't forward the value of the extended attribute if it would // unexpectedly change the behavior of a wrapping overlay layer. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { return "", syserror.ENODATA } @@ -553,9 +552,9 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin return s, err } -func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { +func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { // Don't allow changes to overlay xattrs through a setxattr syscall. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { return syserror.EPERM } @@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st for name := range names { // Same as overlayGetXattr, we shouldn't forward along // overlay attributes. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { delete(names, name) } } @@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error { // Don't allow changes to overlay xattrs through a removexattr syscall. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { return syserror.EPERM } diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 35013a21b..01a1235b8 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -86,13 +86,12 @@ func isXattrOverlay(name string) bool { // NewOverlayRoot produces the root of an overlay. 
// // Preconditions: -// -// - upper and lower must be non-nil. -// - upper must not be an overlay. -// - lower should not expose character devices, pipes, or sockets, because +// * upper and lower must be non-nil. +// * upper must not be an overlay. +// * lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. -// - lower must not require that file objects be revalidated. -// - lower must not have dynamic file/directory content. +// * lower must not require that file objects be revalidated. +// * lower must not have dynamic file/directory content. func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsDir(upper.StableAttr) { return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type) @@ -117,12 +116,11 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount // NewOverlayRootFile produces the root of an overlay that points to a file. // // Preconditions: -// -// - lower must be non-nil. -// - lower should not expose character devices, pipes, or sockets, because +// * lower must be non-nil. +// * lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. Neither it can be a dir. -// - lower must not require that file objects be revalidated. -// - lower must not have dynamic file/directory content. +// * lower must not require that file objects be revalidated. +// * lower must not have dynamic file/directory content. func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsRegular(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is not a regular file") diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 77c2c5c0e..b8b2281a8 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/tcpip/network/ipv4", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 8615b60f0..e555672ad 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -54,7 +55,7 @@ type tcpMemInode struct { // size stores the tcp buffer size during save, and sets the buffer // size in netstack in restore. We must save/restore this here, since - // netstack itself is stateless. + // a netstack instance is created on restore. size inet.TCPBufferSize // mu protects against concurrent reads/writes to files based on this @@ -258,6 +259,9 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque if src.NumBytes() == 0 { return 0, nil } + + // Only consider size of one memory page for input for performance reasons. + // We are only reading if it's zero or not anyway. src = src.TakeFirst(usermem.PageSize - 1) var v int32 @@ -383,11 +387,125 @@ func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.S return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } +// ipForwarding implements fs.InodeOperations. +// +// ipForwarding is used to enable/disable packet forwarding of netstack. 
+// +// +stateify savable +type ipForwarding struct { + fsutil.SimpleFileInode + + stack inet.Stack `state:"wait"` + + // enabled stores the IPv4 forwarding state on save. + // We must save/restore this here, since a netstack instance + // is created on restore. + enabled *bool +} + +func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ipf := &ipForwarding{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + stack: s, + } + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ctx, ipf, msrc, sattr) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*ipForwarding) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// +stateify savable +type ipForwardingFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + ipf *ipForwarding + + stack inet.Stack `state:"wait"` +} + +// GetFile implements fs.InodeOperations.GetFile. +func (ipf *ipForwarding) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &ipForwardingFile{ + stack: ipf.stack, + ipf: ipf, + }), nil +} + +// Read implements fs.FileOperations.Read. +func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + if f.ipf.enabled == nil { + enabled := f.stack.Forwarding(ipv4.ProtocolNumber) + f.ipf.enabled = &enabled + } + + val := "0\n" + if *f.ipf.enabled { + // Technically, this is not quite compatible with Linux. Linux + // stores these as an integer, so if you write "2" into + // ip_forward, you should get 2 back. + val = "1\n" + } + n, err := dst.CopyOut(ctx, []byte(val)) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +// +// Offset is ignored, multiple writes are not supported. +func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Only consider size of one memory page for input for performance reasons. + // We are only reading if it's zero or not anyway. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return n, err + } + if f.ipf.enabled == nil { + f.ipf.enabled = new(bool) + } + *f.ipf.enabled = v != 0 + return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled) +} + func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { contents := map[string]*fs.Inode{ // Add tcp_sack. "tcp_sack": newTCPSackInode(ctx, msrc, s), + // Add ip_forward. 
+ "ip_forward": newIPForwardingInode(ctx, msrc, s), + // The following files are simple stubs until they are // implemented in netstack, most of these files are // configuration related. We use the value closest to the diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 6eba709c6..4cb4741af 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -14,7 +14,11 @@ package proc -import "fmt" +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" +) // beforeSave is invoked by stateify. func (t *tcpMemInode) beforeSave() { @@ -40,3 +44,12 @@ func (s *tcpSack) afterLoad() { } } } + +// afterLoad is invoked by stateify. +func (ipf *ipForwarding) afterLoad() { + if ipf.enabled != nil { + if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil { + panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err)) + } + } +} diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 355e83d47..6ef5738e7 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -123,3 +123,76 @@ func TestConfigureRecvBufferSize(t *testing.T) { } } } + +// TestIPForwarding tests the implementation of +// /proc/sys/net/ipv4/ip_forwarding +func TestIPForwarding(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + + var cases = []struct { + comment string + initial bool + str string + final bool + }{ + { + comment: `Forwarding is disabled; write 1 and enable forwarding`, + initial: false, + str: "1", + final: true, + }, + { + comment: `Forwarding is disabled; write 0 and disable forwarding`, + initial: false, + str: "0", + final: false, + }, + { + comment: `Forwarding is enabled; write 1 and enable forwarding`, + initial: true, + str: "1", + final: true, + }, + { + comment: `Forwarding is enabled; write 0 and disable forwarding`, + initial: true, + str: "0", + final: false, + }, + { + comment: `Forwarding is disabled; write 2404 and enable forwarding`, + initial: false, + str: "2404", + final: true, + }, + { + comment: `Forwarding is enabled; write 2404 and enable forwarding`, + initial: true, + str: "2404", + final: true, + }, + } + for _, c := range cases { + t.Run(c.comment, func(t *testing.T) { + s.IPForwarding = c.initial + ipf := &ipForwarding{stack: s} + file := &ipForwardingFile{ + stack: s, + ipf: ipf, + } + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := file.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. 
+ if got, want := s.IPForwarding, c.final; got != want { + t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want) + } + + }) + } +} diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 9cf7f2a62..22d658acf 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo "auxv": newAuxvec(t, msrc), "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), "comm": newComm(t, msrc), + "cwd": newCwd(t, msrc), "environ": newExecArgInode(t, msrc, environExecArg), "exe": newExe(t, msrc), "fd": newFdDir(t, msrc), @@ -300,6 +301,49 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { return exec.PathnameWithDeleted(ctx), nil } +// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file. +// +// +stateify savable +type cwd struct { + ramfs.Symlink + + t *kernel.Task +} + +func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + cwdSymlink := &cwd{ + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + t: t, + } + return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t) +} + +// Readlink implements fs.InodeOperations. +func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if !kernel.ContextCanTrace(ctx, e.t, false) { + return "", syserror.EACCES + } + if err := checkTaskState(e.t); err != nil { + return "", err + } + cwd := e.t.FSContext().WorkingDirectory() + if cwd == nil { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer cwd.DecRef(ctx) + + root := fs.RootFromContext(ctx) + if root == nil { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer root.DecRef(ctx) + + name, _ := cwd.FullName(root) + return name, nil +} + // namespaceSymlink represents a symlink in the namespacefs, such as the files // in /proc/<pid>/ns. // @@ -604,7 +648,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + fds = fdTable.CurrentMaxFDs() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 1dc75291d..fc0498f17 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -613,7 +613,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional } mf := f.kernel.MemoryFile() - cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index b095312fe..998b697ca 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -16,6 +16,8 @@ package tmpfs import ( + "math" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -32,9 +34,15 @@ import ( var fsInfo = fs.Info{ Type: linux.TMPFS_MAGIC, + // tmpfs currently does not support configurable size limits. In Linux, + // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from + // statfs(2). 
However, many applications treat this as having a size limit + // of 0. To work around this, claim to have a very large but non-zero size, + // chosen to ensure that BlockSize * Blocks does not overflow int64 (which + // applications may also handle incorrectly). // TODO(b/29637826): allow configuring a tmpfs size and enforce it. - TotalBlocks: 0, - FreeBlocks: 0, + TotalBlocks: math.MaxInt64 / usermem.PageSize, + FreeBlocks: math.MaxInt64 / usermem.PageSize, } // rename implements fs.InodeOperations.Rename for tmpfs nodes. diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 5cb0e0417..e6d0eb359 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -10,13 +10,14 @@ go_library( "line_discipline.go", "master.go", "queue.go", - "slave.go", + "replica.go", "terminal.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 463f6189e..c2da80bc2 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -37,14 +37,14 @@ import ( // This indirectly manages all terminals within the mount. // // New Terminals are created by masterInodeOperations.GetFile, which registers -// the slave Inode in the this directory for discovery via Lookup/Readdir. The -// slave inode is unregistered when the master file is Released, as the slave +// the replica Inode in the this directory for discovery via Lookup/Readdir. The +// replica inode is unregistered when the master file is Released, as the replica // is no longer discoverable at that point. // // References on the underlying Terminal are held by masterFileOperations and -// slaveInodeOperations. +// replicaInodeOperations. // -// masterInodeOperations and slaveInodeOperations hold a pointer to +// masterInodeOperations and replicaInodeOperations hold a pointer to // dirInodeOperations, which is reference counted by the refcount their // corresponding Dirents hold on their parent (this directory). // @@ -76,16 +76,16 @@ type dirInodeOperations struct { // master is the master PTY inode. master *fs.Inode - // slaves contains the slave inodes reachable from the directory. + // replicas contains the replica inodes reachable from the directory. // - // A new slave is added by allocateTerminal and is removed by + // A new replica is added by allocateTerminal and is removed by // masterFileOperations.Release. // - // A reference is held on every slave in the map. - slaves map[uint32]*fs.Inode + // A reference is held on every replica in the map. + replicas map[uint32]*fs.Inode // dentryMap is a SortedDentryMap used to implement Readdir containing - // the master and all entries in slaves. + // the master and all entries in replicas. dentryMap *fs.SortedDentryMap // next is the next pty index to use. 
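The tmpfs change above picks `TotalBlocks` as `math.MaxInt64 / usermem.PageSize` precisely so that `BlockSize * Blocks` stays representable in an int64. A small stand-alone check, assuming a 4 KiB page size (the `pageSize` constant here is a hypothetical stand-in for `usermem.PageSize`), confirms the product neither overflows nor rounds to zero:

```go
package main

import (
	"fmt"
	"math"
)

// pageSize stands in for usermem.PageSize (4 KiB on x86-64).
const pageSize = 4096

func main() {
	blocks := int64(math.MaxInt64 / pageSize)
	total := blocks * pageSize // must not overflow int64

	// The reported size sits just under math.MaxInt64: large enough that
	// no application mistakes the mount for a zero-sized filesystem, small
	// enough that BlockSize * Blocks computed by applications stays positive.
	fmt.Println(total > 0)                      // true
	fmt.Println(math.MaxInt64-total < pageSize) // true: within one block of the max
}
```

Linux reports zero blocks for a size-unlimited tmpfs, so this deliberately diverges to accommodate applications that misread a zero total as a zero size limit.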
@@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { d := &dirInodeOperations{ InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC), msrc: m, - slaves: make(map[uint32]*fs.Inode), + replicas: make(map[uint32]*fs.Inode), dentryMap: fs.NewSortedDentryMap(nil), } // Linux devpts uses a default mode of 0000 for ptmx which can be @@ -133,7 +133,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) { defer d.mu.Unlock() d.master.DecRef(ctx) - if len(d.slaves) != 0 { + if len(d.replicas) != 0 { panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) } } @@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str return fs.NewDirent(ctx, d.master, name), nil } - // Slave number? + // Replica number? n, err := strconv.ParseUint(name, 10, 32) if err != nil { // Not found. return nil, syserror.ENOENT } - s, ok := d.slaves[uint32(n)] + s, ok := d.replicas[uint32(n)] if !ok { return nil, syserror.ENOENT } @@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e return nil, syserror.ENOMEM } - if _, ok := d.slaves[n]; ok { + if _, ok := d.replicas[n]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", n)) } @@ -244,19 +244,19 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e d.next++ // The reference returned by newTerminal is returned to the caller. - // Take another for the slave inode. + // Take another for the replica inode. t.IncRef() // Create a pts node. The owner is based on the context that opens // ptmx. creds := auth.CredentialsFromContext(ctx) uid, gid := creds.EffectiveKUID, creds.EffectiveKGID - slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666)) + replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666)) - d.slaves[n] = slave + d.replicas[n] = replica d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{ - Type: slave.StableAttr.Type, - InodeID: slave.StableAttr.InodeID, + Type: replica.StableAttr.Type, + InodeID: replica.StableAttr.InodeID, }) return t, nil @@ -267,18 +267,18 @@ func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) { d.mu.Lock() defer d.mu.Unlock() - // The slave end disappears from the directory when the master end is - // closed, even if the slave end is open elsewhere. + // The replica end disappears from the directory when the master end is + // closed, even if the replica end is open elsewhere. // // N.B. since we're using a backdoor method to remove a directory entry // we won't properly fire inotify events like Linux would. - s, ok := d.slaves[t.n] + s, ok := d.replicas[t.n] if !ok { panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) } s.DecRef(ctx) - delete(d.slaves, t.n) + delete(d.replicas, t.n) d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) } diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 2d4d44bf3..13f4901db 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -79,8 +79,8 @@ type superOperations struct{} // // It always returns true, forcing a Lookup for all entries. 
// -// Slave entries are dropped from dir when their master is closed, so an -// existing slave Dirent in the tree is not sufficient to guarantee that it +// Replica entries are dropped from dir when their master is closed, so an +// existing replica Dirent in the tree is not sufficient to guarantee that it // still exists on the filesystem. func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool { return true diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 2e9dd2d55..b34f4a0eb 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -43,7 +44,7 @@ const ( ) // lineDiscipline dictates how input and output are handled between the -// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// pseudoterminal (pty) master and replica. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man // pages are good resources for how to affect the line discipline: // @@ -54,8 +55,8 @@ const ( // // lineDiscipline has a simple structure but supports a multitude of options // (see the above man pages). It consists of two queues of bytes: one from the -// terminal master to slave (the input queue) and one from slave to master (the -// output queue). When bytes are written to one end of the pty, the line +// terminal master to replica (the input queue) and one from replica to master +// (the output queue). When bytes are written to one end of the pty, the line // discipline reads the bytes, modifies them or takes special action if // required, and enqueues them to be read by the other end of the pty: // @@ -64,7 +65,7 @@ const ( // | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v -// masterFD slaveFD +// masterFD replicaFD // ^ | // | | // | output to terminal +--------------+ output from process | @@ -103,8 +104,8 @@ type lineDiscipline struct { // masterWaiter is used to wait on the master end of the TTY. masterWaiter waiter.Queue `state:"zerovalue"` - // slaveWaiter is used to wait on the slave end of the TTY. - slaveWaiter waiter.Queue `state:"zerovalue"` + // replicaWaiter is used to wait on the replica end of the TTY. + replicaWaiter waiter.Queue `state:"zerovalue"` } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { @@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { } // getTermios gets the linux.Termios for the tty. -func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyOut(task, args[2].Pointer()) return 0, err } // setTermios sets a linux.Termios for the tty. 
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyIn(task, args[2].Pointer()) l.termios.FromTermios(t) // If canonical mode is turned off, move bytes from inQueue's wait @@ -146,27 +143,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return 0, err } -func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyOut(t, args[2].Pointer()) return err } -func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyIn(t, args[2].Pointer()) return err } @@ -176,14 +169,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } -func (l *lineDiscipline) slaveReadiness() waiter.EventMask { +func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } -func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.inQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error { + return l.inQueue.readableSize(t, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -196,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque if n > 0 { l.masterWaiter.Notify(waiter.EventOut) if pushed { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return n, nil } @@ -211,14 +204,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) return n, nil } return 0, syserror.ErrWouldBlock } -func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.outQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error { + return l.outQueue.readableSize(t, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) 
{ @@ -229,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventOut) + l.replicaWaiter.Notify(waiter.EventOut) if pushed { l.masterWaiter.Notify(waiter.EventIn) } diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index e00746017..b91184b1b 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -17,9 +17,11 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm // Ioctl implements fs.FileOperations.Ioctl. func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. - return 0, mf.t.ld.outputQueueReadSize(ctx, io, args) + return 0, mf.t.ld.outputQueueReadSize(t, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration - // of the slave end. - return mf.t.ld.getTermios(ctx, io, args) + // of the replica end. + return mf.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration - // of the slave end. - return mf.t.ld.setTermios(ctx, io, args) + // of the replica end. + return mf.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. - return mf.t.ld.setTermios(ctx, io, args) + return mf.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(mf.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: - return 0, mf.t.ld.windowSize(ctx, io, args) + return 0, mf.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, mf.t.ld.setWindowSize(ctx, io, args) + return 0, mf.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. 
- return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index ceabb9b1e..79975d812 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -17,8 +17,10 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -32,7 +34,7 @@ import ( const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and -// slave. Bytes written to a queue are added to the read buffer until it is +// replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and @@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { } // readableSize writes the number of readable bytes to userspace. -func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() - var size int32 + size := primitive.Int32(0) if q.readable { - size = int32(len(q.readBuf)) + size = primitive.Int32(len(q.readBuf)) } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := size.CopyOut(t, args[2].Pointer()) return err } @@ -104,8 +104,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca // as whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() @@ -145,8 +144,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl // write writes to q from userspace. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { q.mu.Lock() defer q.mu.Unlock() @@ -188,8 +186,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip // writeBytes writes to q from b. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. 
func (q *queue) writeBytes(b []byte, l *lineDiscipline) { q.mu.Lock() defer q.mu.Unlock() diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/replica.go index 7c7292687..385d230fb 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/replica.go @@ -17,9 +17,11 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -27,11 +29,11 @@ import ( // LINT.IfChange -// slaveInodeOperations are the fs.InodeOperations for the slave end of the +// replicaInodeOperations are the fs.InodeOperations for the replica end of the // Terminal (pts file). // // +stateify savable -type slaveInodeOperations struct { +type replicaInodeOperations struct { fsutil.SimpleFileInode // d is the containing dir. @@ -41,13 +43,13 @@ type slaveInodeOperations struct { t *Terminal } -var _ fs.InodeOperations = (*slaveInodeOperations)(nil) +var _ fs.InodeOperations = (*replicaInodeOperations)(nil) -// newSlaveInode creates an fs.Inode for the slave end of a terminal. +// newReplicaInode creates an fs.Inode for the replica end of a terminal. // -// newSlaveInode takes ownership of t. -func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { - iops := &slaveInodeOperations{ +// newReplicaInode takes ownership of t. +func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { + iops := &replicaInodeOperations{ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC), d: d, t: t, @@ -64,18 +66,18 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne Type: fs.CharacterDevice, // See fs/devpts/inode.c:devpts_fill_super. BlockSize: 1024, - DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR, + DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR, DeviceFileMinor: t.n, }) } // Release implements fs.InodeOperations.Release. -func (si *slaveInodeOperations) Release(ctx context.Context) { +func (si *replicaInodeOperations) Release(ctx context.Context) { si.t.DecRef(ctx) } // Truncate implements fs.InodeOperations.Truncate. -func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { +func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { return nil } @@ -83,14 +85,15 @@ func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { // // This may race with destruction of the terminal. If the terminal is gone, it // returns ENOENT. -func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil +func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil } -// slaveFileOperations are the fs.FileOperations for the slave end of a terminal. +// replicaFileOperations are the fs.FileOperations for the replica end of a +// terminal. 
// // +stateify savable -type slaveFileOperations struct { +type replicaFileOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` @@ -100,79 +103,84 @@ type slaveFileOperations struct { fsutil.FileUseInodeUnstableAttr `state:"nosave"` // si is the inode operations. - si *slaveInodeOperations + si *replicaInodeOperations } -var _ fs.FileOperations = (*slaveFileOperations)(nil) +var _ fs.FileOperations = (*replicaFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (sf *slaveFileOperations) Release(context.Context) { +func (sf *replicaFileOperations) Release(context.Context) { } // EventRegister implements waiter.Waitable.EventRegister. -func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - sf.si.t.ld.slaveWaiter.EventRegister(e, mask) +func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sf.si.t.ld.replicaWaiter.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) { - sf.si.t.ld.slaveWaiter.EventUnregister(e) +func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) { + sf.si.t.ld.replicaWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. -func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { - return sf.si.t.ld.slaveReadiness() +func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return sf.si.t.ld.replicaReadiness() } // Read implements fs.FileOperations.Read. -func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { +func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { return sf.si.t.ld.inputQueueRead(ctx, dst) } // Write implements fs.FileOperations.Write. -func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { +func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { return sf.si.t.ld.outputQueueWrite(ctx, src) } // Ioctl implements fs.FileOperations.Ioctl. -func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. - return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args) + return 0, sf.si.t.ld.inputQueueReadSize(t, args) case linux.TCGETS: - return sf.si.t.ld.getTermios(ctx, io, args) + return sf.si.t.ld.getTermios(t, args) case linux.TCSETS: - return sf.si.t.ld.setTermios(ctx, io, args) + return sf.si.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. 
- return sf.si.t.ld.setTermios(ctx, io, args) + return sf.si.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(sf.si.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCGWINSZ: - return 0, sf.si.t.ld.windowSize(ctx, io, args) + return 0, sf.si.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, sf.si.t.ld.setWindowSize(ctx, io, args) + return 0, sf.si.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */) + return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) + return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) + return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. - return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) + return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY } } -// LINT.ThenChange(../../fsimpl/devpts/slave.go) +// LINT.ThenChange(../../fsimpl/devpts/replica.go) diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index ddcccf4da..4f431d74d 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -17,10 +17,10 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange @@ -44,19 +44,19 @@ type Terminal struct { // this terminal. This field is immutable. masterKTTY *kernel.TTY - // slaveKTTY contains the controlling process of the slave end of this + // replicaKTTY contains the controlling process of the replica end of this // terminal. This field is immutable. - slaveKTTY *kernel.TTY + replicaKTTY *kernel.TTY } func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { - termios := linux.DefaultSlaveTermios + termios := linux.DefaultReplicaTermios t := Terminal{ - d: d, - n: n, - ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{Index: n}, - slaveKTTY: &kernel.TTY{Index: n}, + d: d, + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + replicaKTTY: &kernel.TTY{Index: n}, } t.EnableLeakCheck("tty.Terminal") return &t @@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal // setControllingTTY makes tm the controlling terminal of the calling thread // group. 
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("setControllingTTY must be called from a task context") @@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a // releaseControllingTTY removes tm as the controlling terminal of the calling // thread group. -func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("releaseControllingTTY must be called from a task context") @@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar } // foregroundProcessGroup gets the process group ID of tm's foreground process. -func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("foregroundProcessGroup must be called from a task context") @@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a } // Write it out to *arg. - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ - AddressSpaceActive: true, - }) + retP := primitive.Int32(ret) + _, err = retP.CopyOut(task, args[2].Pointer()) return 0, err } // foregroundProcessGroup sets tm's foreground process. -func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("setForegroundProcessGroup must be called from a task context") } // Read in the process group ID. 
- var pgid int32 - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgid primitive.Int32 + if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } @@ -126,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { if isMaster { return tm.masterKTTY } - return tm.slaveKTTY + return tm.replicaKTTY } // LINT.ThenChange(../../fsimpl/devpts/terminal.go) diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 2cbc05678..49edee83d 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func TestSimpleMasterToSlave(t *testing.T) { - ld := newLineDiscipline(linux.DefaultSlaveTermios) +func TestSimpleMasterToReplica(t *testing.T) { + ld := newLineDiscipline(linux.DefaultReplicaTermios) ctx := contexttest.Context(t) inBytes := []byte("hello, tty\n") src := usermem.BytesIOSequence(inBytes) diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index 2f5a43b84..124bc95ed 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -121,6 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { root := mns.Root() + root.IncRef() defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go index 936fd3932..1f8684dc6 100644 --- a/pkg/sentry/fs/user/user.go +++ b/pkg/sentry/fs/user/user.go @@ -105,6 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. const defaultHome = "/" root := mns.Root() + root.IncRef() defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index 323506d33..be0900030 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -122,7 +122,7 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) // remainingTraversals is not configurable in VFS2, all callers are using the // default anyways. 
func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { - vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() + vfsObj := l.root.Mount().Filesystem().VirtualFilesystem() creds := auth.CredentialsFromContext(ctx) path := fspath.Parse(pathname) pop := &vfs.PathOperation{ diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 93512c9b6..84baaac66 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -1,7 +1,19 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "root_inode_refs", + out = "root_inode_refs.go", + package = "devpts", + prefix = "rootInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "rootInode", + }, +) + go_library( name = "devpts", srcs = [ @@ -9,15 +21,21 @@ go_library( "line_discipline.go", "master.go", "queue.go", - "slave.go", + "replica.go", + "root_inode_refs.go", "terminal.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 7169e91af..d5c5aaa8c 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -35,29 +35,56 @@ import ( const Name = "devpts" // FilesystemType implements vfs.FilesystemType. -type FilesystemType struct{} +// +// +stateify savable +type FilesystemType struct { + initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. + initErr error + + // fs backs all mounts of this FilesystemType. root is fs' root. fs and root + // are immutable. + fs *vfs.Filesystem + root *vfs.Dentry +} // Name implements vfs.FilesystemType.Name. -func (FilesystemType) Name() string { +func (*FilesystemType) Name() string { return Name } -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // No data allowed. if opts.Data != "" { return nil, nil, syserror.EINVAL } - fs, root, err := fstype.newFilesystem(vfsObj, creds) - if err != nil { - return nil, nil, err + fstype.initOnce.Do(func() { + fs, root, err := fstype.newFilesystem(vfsObj, creds) + if err != nil { + fstype.initErr = err + return + } + fstype.fs = fs.VFSFilesystem() + fstype.root = root.VFSDentry() + }) + if fstype.initErr != nil { + return nil, nil, fstype.initErr + } + fstype.fs.IncRef() + fstype.root.IncRef() + return fstype.fs, fstype.root, nil +} + +// Release implements vfs.FilesystemType.Release. 
+func (fstype *FilesystemType) Release(ctx context.Context) { + if fstype.fs != nil { + fstype.root.DecRef(ctx) + fstype.fs.DecRef(ctx) } - return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -66,7 +93,7 @@ type filesystem struct { // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. -func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { +func (fstype *FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err @@ -79,11 +106,14 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds // Construct the root directory. This is always inode id 1. root := &rootInode{ - slaves: make(map[uint32]*slaveInode), + replicas: make(map[uint32]*replicaInode), } root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - root.dentry.Init(root) + root.EnableLeakCheck() + + var rootD kernfs.Dentry + rootD.Init(&fs.Filesystem, root) // Construct the pts master inode and dentry. Linux always uses inode // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. @@ -91,15 +121,14 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds root: root, } master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) - master.dentry.Init(master) // Add the master as a child of the root. - links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{ - "ptmx": &master.dentry, + links := root.OrderedChildren.Populate(map[string]kernfs.Inode{ + "ptmx": master, }) root.IncLinks(links) - return fs, &root.dentry, nil + return fs, &rootD, nil } // Release implements vfs.FilesystemImpl.Release. @@ -109,29 +138,28 @@ func (fs *filesystem) Release(ctx context.Context) { } // rootInode is the root directory inode for the devpts mounts. +// +// +stateify savable type rootInode struct { - kernfs.AlwaysValid + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren + rootInodeRefs locks vfs.FileLocks - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - // master is the master pty inode. Immutable. master *masterInode - // root is the root directory inode for this filesystem. Immutable. - root *rootInode - // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` - // slaves maps pty ids to slave inodes. - slaves map[uint32]*slaveInode + // replicas maps pty ids to replica inodes. + replicas map[uint32]*replicaInode // nextIdx is the next pty index to use. Must be accessed atomically. // @@ -151,41 +179,46 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) idx := i.nextIdx i.nextIdx++ - // Sanity check that slave with idx does not exist. - if _, ok := i.slaves[idx]; ok { + // Sanity check that replica with idx does not exist. + if _, ok := i.replicas[idx]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) } - // Create the new terminal and slave. 
+ // Create the new terminal and replica. t := newTerminal(idx) - slave := &slaveInode{ + replica := &replicaInode{ root: i, t: t, } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). - slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) - slave.dentry.Init(slave) - i.slaves[idx] = slave + replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) + i.replicas[idx] = replica return t, nil } // masterClose is called when the master end of t is closed. -func (i *rootInode) masterClose(t *Terminal) { +func (i *rootInode) masterClose(ctx context.Context, t *Terminal) { i.mu.Lock() defer i.mu.Unlock() - // Sanity check that slave with idx exists. - if _, ok := i.slaves[t.n]; !ok { + // Sanity check that replica with idx exists. + ri, ok := i.replicas[t.n] + if !ok { panic(fmt.Sprintf("pty with index %d does not exist", t.n)) } - delete(i.slaves, t.n) + + // Drop the ref on replica inode taken during rootInode.allocateTerminal. + ri.DecRef(ctx) + delete(i.replicas, t.n) } // Open implements kernfs.Inode.Open. -func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } @@ -193,16 +226,22 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D } // Lookup implements kernfs.Inode.Lookup. -func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { + // Check if a static entry was looked up. + if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { + return d, nil + } + + // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT } i.mu.Lock() defer i.mu.Unlock() - if si, ok := i.slaves[uint32(idx)]; ok { - si.dentry.IncRef() - return si.dentry.VFSDentry(), nil + if ri, ok := i.replicas[uint32(idx)]; ok { + ri.IncRef() // This ref is passed to the dentry upon creation via Init. 
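The root_inode_refs template instance in the BUILD hunk earlier gives rootInode a generated reference count (EnableLeakCheck arms leak detection on it), and masterClose above now explicitly drops the reference that allocateTerminal took on each replica inode. A toy sketch of the destroy-on-last-reference contract such a generated type provides, matching the DecRef-with-callback shape used just below (toy code, not the generated template):

package main

import (
	"fmt"
	"sync/atomic"
)

// refs mimics a generated <T>Refs type: an atomic count plus a destructor
// callback invoked when the count reaches zero.
type refs struct {
	count int64
}

func newRefs() *refs { return &refs{count: 1} } // starts with one reference

func (r *refs) IncRef() {
	if atomic.AddInt64(&r.count, 1) <= 1 {
		panic("IncRef on released object")
	}
}

func (r *refs) DecRef(destroy func()) {
	v := atomic.AddInt64(&r.count, -1)
	if v < 0 {
		panic("DecRef on released object")
	}
	if v == 0 {
		destroy() // last reference gone: tear the object down
	}
}

func main() {
	r := newRefs()
	r.IncRef()                                    // e.g. the ref handed to a new dentry
	r.DecRef(func() { fmt.Println("too early") }) // count 2 -> 1, no call
	r.DecRef(func() { fmt.Println("destroyed") }) // count 1 -> 0, destructor runs
}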
+ return ri, nil } return nil, syserror.ENOENT @@ -212,8 +251,8 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { i.mu.Lock() defer i.mu.Unlock() - ids := make([]int, 0, len(i.slaves)) - for id := range i.slaves { + ids := make([]int, 0, len(i.replicas)) + for id := range i.replicas { ids = append(ids, int(id)) } sort.Ints(ids) @@ -221,7 +260,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(id), 10), Type: linux.DT_CHR, - Ino: i.slaves[uint32(id)].InodeAttrs.Ino(), + Ino: i.replicas[uint32(id)].InodeAttrs.Ino(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -231,3 +270,16 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, } return offset, nil } + +// DecRef implements kernfs.Inode.DecRef. +func (i *rootInode) DecRef(ctx context.Context) { + i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go index b7c149047..448390cfe 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_test.go +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func TestSimpleMasterToSlave(t *testing.T) { - ld := newLineDiscipline(linux.DefaultSlaveTermios) +func TestSimpleMasterToReplica(t *testing.T) { + ld := newLineDiscipline(linux.DefaultReplicaTermios) ctx := contexttest.Context(t) inBytes := []byte("hello, tty\n") src := usermem.BytesIOSequence(inBytes) diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index f7bc325d1..e6b0e81cf 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -41,7 +42,7 @@ const ( ) // lineDiscipline dictates how input and output are handled between the -// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// pseudoterminal (pty) master and replica. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man // pages are good resources for how to affect the line discipline: // @@ -52,8 +53,8 @@ const ( // // lineDiscipline has a simple structure but supports a multitude of options // (see the above man pages). It consists of two queues of bytes: one from the -// terminal master to slave (the input queue) and one from slave to master (the -// output queue). When bytes are written to one end of the pty, the line +// terminal master to replica (the input queue) and one from replica to master +// (the output queue). 
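rootInode.Lookup above resolves in two steps: static children (ptmx) win via OrderedChildren, then the name is parsed as a pty index, and a hit returns the inode carrying the extra reference the new dentry will own. A standalone toy version of that resolution order, with simplified error values:

package main

import (
	"errors"
	"fmt"
	"strconv"
	"sync"
)

var errNoEnt = errors.New("ENOENT")

type inode struct{ name string }

type dir struct {
	static   map[string]*inode // e.g. "ptmx"
	mu       sync.Mutex
	replicas map[uint32]*inode // live pty indices
}

// lookup mirrors rootInode.Lookup: static entries first, then numeric names
// are treated as replica indices. (The real code also IncRefs the result.)
func (d *dir) lookup(name string) (*inode, error) {
	if i, ok := d.static[name]; ok {
		return i, nil
	}
	idx, err := strconv.ParseUint(name, 10, 32)
	if err != nil {
		return nil, errNoEnt
	}
	d.mu.Lock()
	defer d.mu.Unlock()
	if i, ok := d.replicas[uint32(idx)]; ok {
		return i, nil
	}
	return nil, errNoEnt
}

func main() {
	d := &dir{
		static:   map[string]*inode{"ptmx": {"ptmx"}},
		replicas: map[uint32]*inode{0: {"pts0"}},
	}
	for _, n := range []string{"ptmx", "0", "7"} {
		if i, err := d.lookup(n); err != nil {
			fmt.Println(n, "->", err)
		} else {
			fmt.Println(n, "->", i.name)
		}
	}
}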
When bytes are written to one end of the pty, the line // discipline reads the bytes, modifies them or takes special action if // required, and enqueues them to be read by the other end of the pty: // @@ -62,7 +63,7 @@ const ( // | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v -// masterFD slaveFD +// masterFD replicaFD // ^ | // | | // | output to terminal +--------------+ output from process | @@ -101,8 +102,8 @@ type lineDiscipline struct { // masterWaiter is used to wait on the master end of the TTY. masterWaiter waiter.Queue `state:"zerovalue"` - // slaveWaiter is used to wait on the slave end of the TTY. - slaveWaiter waiter.Queue `state:"zerovalue"` + // replicaWaiter is used to wait on the replica end of the TTY. + replicaWaiter waiter.Queue `state:"zerovalue"` } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { @@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { } // getTermios gets the linux.Termios for the tty. -func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyOut(task, args[2].Pointer()) return 0, err } // setTermios sets a linux.Termios for the tty. -func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. 
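From here on, the tty ioctl paths swap reflection-based usermem.CopyObjectOut/CopyObjectIn for the typed CopyOut/CopyIn that go_marshal generates for +marshal types such as linux.Termios, with the kernel.Task acting as the copy context. A standalone sketch of the idea, with hand-written fixed-layout encoding standing in for the generated code:

package main

import (
	"encoding/binary"
	"fmt"
)

// termios is a stand-in for a +marshal struct with a fixed wire layout.
type termios struct {
	IFlag, OFlag uint32
}

// sizeBytes mirrors the generated SizeBytes().
func (t *termios) sizeBytes() int { return 8 }

// marshalBytes mirrors the generated MarshalBytes: a fixed little-endian
// layout, no reflection at copy time.
func (t *termios) marshalBytes(dst []byte) {
	binary.LittleEndian.PutUint32(dst[0:], t.IFlag)
	binary.LittleEndian.PutUint32(dst[4:], t.OFlag)
}

// task is a stand-in copy context: copyOut writes marshalled bytes into a
// fake user address space.
type task struct{ mem map[uintptr][]byte }

func (tk *task) copyOut(addr uintptr, t *termios) (int, error) {
	buf := make([]byte, t.sizeBytes())
	t.marshalBytes(buf)
	tk.mem[addr] = buf
	return len(buf), nil
}

func main() {
	tk := &task{mem: map[uintptr][]byte{}}
	n, _ := tk.copyOut(0x1000, &termios{IFlag: 1, OFlag: 2})
	fmt.Println(n, tk.mem[0x1000]) // 8 [1 0 0 0 2 0 0 0]
}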
var t linux.Termios - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyIn(task, args[2].Pointer()) l.termios.FromTermios(t) // If canonical mode is turned off, move bytes from inQueue's wait @@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return 0, err } -func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyOut(t, args[2].Pointer()) return err } -func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyIn(t, args[2].Pointer()) return err } @@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } -func (l *lineDiscipline) slaveReadiness() waiter.EventMask { +func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } -func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.inQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { + return l.inQueue.readableSize(t, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque if n > 0 { l.masterWaiter.Notify(waiter.EventOut) if pushed { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return n, nil } @@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) return n, nil } return 0, syserror.ErrWouldBlock } -func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.outQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { + return l.outQueue.readableSize(t, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventOut) + l.replicaWaiter.Notify(waiter.EventOut) if pushed { l.masterWaiter.Notify(waiter.EventIn) } diff --git a/pkg/sentry/fsimpl/devpts/master.go 
b/pkg/sentry/fsimpl/devpts/master.go index 3bb397f71..fda30fb93 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -17,9 +17,11 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -29,7 +31,10 @@ import ( ) // masterInode is the inode for the master end of the Terminal. +// +// +stateify savable type masterInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory @@ -37,9 +42,6 @@ type masterInode struct { locks vfs.FileLocks - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - // root is the devpts root inode. root *rootInode } @@ -47,20 +49,18 @@ type masterInode struct { var _ kernfs.Inode = (*masterInode)(nil) // Open implements kernfs.Inode.Open. -func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { t, err := mi.root.allocateTerminal(rp.Credentials()) if err != nil { return nil, err } - mi.IncRef() fd := &masterFileDescription{ inode: mi, t: t, } fd.LockFD.Init(&mi.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - mi.DecRef(ctx) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -86,6 +86,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } +// +stateify savable type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -99,8 +100,7 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. func (mfd *masterFileDescription) Release(ctx context.Context) { - mfd.inode.root.masterClose(mfd.t) - mfd.inode.DecRef(ctx) + mfd.inode.root.masterClose(ctx, mfd.t) } // EventRegister implements waiter.Waitable.EventRegister. @@ -130,46 +130,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. - return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args) + return 0, mfd.t.ld.outputQueueReadSize(t, io, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration - // of the slave end. - return mfd.t.ld.getTermios(ctx, io, args) + // of the replica end. + return mfd.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration - // of the slave end. 
- return mfd.t.ld.setTermios(ctx, io, args) + // of the replica end. + return mfd.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. - return mfd.t.ld.setTermios(ctx, io, args) + return mfd.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(mfd.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: - return 0, mfd.t.ld.windowSize(ctx, io, args) + return 0, mfd.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, mfd.t.ld.setWindowSize(ctx, io, args) + return 0, mfd.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. - return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index dffb4232c..55bff3e60 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -17,8 +17,10 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -30,7 +32,7 @@ import ( const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and -// slave. Bytes written to a queue are added to the read buffer until it is +// replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and @@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { } // readableSize writes the number of readable bytes to userspace. 
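To make the read-buffer/wait-buffer split described above concrete: writes fill the bounded read buffer, overflow spills into the wait buffer, and reads pull waiting bytes forward (as pushWaitBufLocked does when canonical mode is switched off). A toy queue with the same shape, using a deliberately tiny capacity:

package main

import "fmt"

const readBufMax = 4 // tiny for illustration; the real cap is much larger

type queue struct {
	readBuf []byte
	waitBuf []byte
}

func (q *queue) write(b []byte) {
	room := readBufMax - len(q.readBuf)
	if room > len(b) {
		room = len(b)
	}
	q.readBuf = append(q.readBuf, b[:room]...)
	q.waitBuf = append(q.waitBuf, b[room:]...) // overflow waits its turn
}

// pushWaitBuf moves waiting bytes into freed read-buffer space.
func (q *queue) pushWaitBuf() {
	room := readBufMax - len(q.readBuf)
	if room > len(q.waitBuf) {
		room = len(q.waitBuf)
	}
	q.readBuf = append(q.readBuf, q.waitBuf[:room]...)
	q.waitBuf = append([]byte(nil), q.waitBuf[room:]...)
}

func (q *queue) read(n int) []byte {
	if n > len(q.readBuf) {
		n = len(q.readBuf)
	}
	out := append([]byte(nil), q.readBuf[:n]...)
	q.readBuf = append([]byte(nil), q.readBuf[n:]...)
	q.pushWaitBuf()
	return out
}

func main() {
	var q queue
	q.write([]byte("hello!"))
	fmt.Printf("%q %q\n", q.readBuf, q.waitBuf) // "hell" "o!"
	fmt.Printf("%q\n", q.read(3))               // "hel"
	fmt.Printf("%q %q\n", q.readBuf, q.waitBuf) // "lo!" ""
}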
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() - var size int32 + size := primitive.Int32(0) if q.readable { - size = int32(len(q.readBuf)) + size = primitive.Int32(len(q.readBuf)) } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := size.CopyOut(t, args[2].Pointer()) return err } @@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca // as whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() @@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl // write writes to q from userspace. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { q.mu.Lock() defer q.mu.Unlock() @@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip // writeBytes writes to q from b. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) writeBytes(b []byte, l *lineDiscipline) { q.mu.Lock() defer q.mu.Unlock() diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go new file mode 100644 index 000000000..70c68cf0a --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -0,0 +1,201 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// replicaInode is the inode for the replica end of the Terminal. +// +// +stateify savable +type replicaInode struct { + implStatFS + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks + + // root is the devpts root inode. + root *rootInode + + // t is the connected Terminal. 
+ t *Terminal +} + +var _ kernfs.Inode = (*replicaInode)(nil) + +// Open implements kernfs.Inode.Open. +func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &replicaFileDescription{ + inode: ri, + } + fd.LockFD.Init(&ri.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + +} + +// Valid implements kernfs.Inode.Valid. +func (ri *replicaInode) Valid(context.Context) bool { + // Return valid if the replica still exists. + ri.root.mu.Lock() + defer ri.root.mu.Unlock() + _, ok := ri.root.replicas[ri.t.n] + return ok +} + +// Stat implements kernfs.Inode.Stat. +func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR + statx.RdevMinor = ri.t.n + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +// +stateify savable +type replicaFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + inode *replicaInode +} + +var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil) + +// Release implements fs.FileOperations.Release. +func (rfd *replicaFileDescription) Release(ctx context.Context) {} + +// EventRegister implements waiter.Waitable.EventRegister. +func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) { + rfd.inode.t.ld.replicaWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return rfd.inode.t.ld.replicaReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return rfd.inode.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return rfd.inode.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. 
+ return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args) + case linux.TCGETS: + return rfd.inode.t.ld.getTermios(t, args) + case linux.TCSETS: + return rfd.inode.t.ld.setTermios(t, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return rfd.inode.t.ld.setTermios(t, args) + case linux.TIOCGPTN: + nP := primitive.Uint32(rfd.inode.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) + return 0, err + case linux.TIOCGWINSZ: + return 0, rfd.inode.t.ld.windowSize(t, args) + case linux.TIOCSWINSZ: + return 0, rfd.inode.t.ld.setWindowSize(t, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() + return rfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() + return rfd.inode.Stat(ctx, fs, opts) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go deleted file mode 100644 index 32e4e1908..000000000 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
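Both Ioctl implementations (master earlier, replica above) now begin by recovering the kernel.Task from the context and return ENOTTY when there is none, since the typed copy helpers need a task as their copy context. A compact standalone sketch of that guard-then-dispatch shape, using context values as a stand-in for gVisor's task-from-context plumbing:

package main

import (
	"context"
	"errors"
	"fmt"
)

var errNoTTY = errors.New("ENOTTY")

type taskKey struct{}

// task stands in for kernel.Task; ioctl argument copies need one.
type task struct{ name string }

func taskFromContext(ctx context.Context) *task {
	t, _ := ctx.Value(taskKey{}).(*task)
	return t
}

// ioctl mirrors the dispatch shape above: bail out early without a task,
// then switch on the command.
func ioctl(ctx context.Context, cmd uint32) error {
	t := taskFromContext(ctx)
	if t == nil {
		// ioctl(2) may only be called from a task goroutine.
		return errNoTTY
	}
	switch cmd {
	case 0x541B: // FIONREAD == TIOCINQ
		fmt.Println(t.name, "reports readable bytes")
		return nil
	default:
		return errNoTTY
	}
}

func main() {
	fmt.Println(ioctl(context.Background(), 0x541B)) // no task: ENOTTY
	ctx := context.WithValue(context.Background(), taskKey{}, &task{"t1"})
	fmt.Println(ioctl(ctx, 0x541B))
}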
- -package devpts - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// slaveInode is the inode for the slave end of the Terminal. -type slaveInode struct { - kernfs.InodeAttrs - kernfs.InodeNoopRefCount - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink - - locks vfs.FileLocks - - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - - // root is the devpts root inode. - root *rootInode - - // t is the connected Terminal. - t *Terminal -} - -var _ kernfs.Inode = (*slaveInode)(nil) - -// Open implements kernfs.Inode.Open. -func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - si.IncRef() - fd := &slaveFileDescription{ - inode: si, - } - fd.LockFD.Init(&si.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - si.DecRef(ctx) - return nil, err - } - return &fd.vfsfd, nil - -} - -// Valid implements kernfs.Inode.Valid. -func (si *slaveInode) Valid(context.Context) bool { - // Return valid if the slave still exists. - si.root.mu.Lock() - defer si.root.mu.Unlock() - _, ok := si.root.slaves[si.t.n] - return ok -} - -// Stat implements kernfs.Inode.Stat. -func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts) - if err != nil { - return linux.Statx{}, err - } - statx.Blksize = 1024 - statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR - statx.RdevMinor = si.t.n - return statx, nil -} - -// SetStat implements kernfs.Inode.SetStat -func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return syserror.EINVAL - } - return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) -} - -type slaveFileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - vfs.LockFD - - inode *slaveInode -} - -var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) - -// Release implements fs.FileOperations.Release. -func (sfd *slaveFileDescription) Release(ctx context.Context) { - sfd.inode.DecRef(ctx) -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) { - sfd.inode.t.ld.slaveWaiter.EventUnregister(e) -} - -// Readiness implements waiter.Waitable.Readiness. -func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - return sfd.inode.t.ld.slaveReadiness() -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { - return sfd.inode.t.ld.inputQueueRead(ctx, dst) -} - -// Write implements vfs.FileDescriptionImpl.Write. 
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { - return sfd.inode.t.ld.outputQueueWrite(ctx, src) -} - -// Ioctl implements vfs.FileDescriptionImpl.Ioctl. -func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch cmd := args[1].Uint(); cmd { - case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ - // Get the number of bytes in the input queue read buffer. - return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args) - case linux.TCGETS: - return sfd.inode.t.ld.getTermios(ctx, io, args) - case linux.TCSETS: - return sfd.inode.t.ld.setTermios(ctx, io, args) - case linux.TCSETSW: - // TODO(b/29356795): This should drain the output queue first. - return sfd.inode.t.ld.setTermios(ctx, io, args) - case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - case linux.TIOCGWINSZ: - return 0, sfd.inode.t.ld.windowSize(ctx, io, args) - case linux.TIOCSWINSZ: - return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args) - case linux.TIOCSCTTY: - // Make the given terminal the controlling terminal of the - // calling process. - return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */) - case linux.TIOCNOTTY: - // Release this process's controlling terminal. - return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) - case linux.TIOCGPGRP: - // Get the foreground process group. - return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) - case linux.TIOCSPGRP: - // Set the foreground process group. - return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) - default: - maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY - } -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - creds := auth.CredentialsFromContext(ctx) - fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.SetStat(ctx, fs, creds, opts) -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.Stat(ctx, fs, opts) -} - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
-func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go index 7d2781c54..510bd6d89 100644 --- a/pkg/sentry/fsimpl/devpts/terminal.go +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -17,9 +17,9 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) // Terminal is a pseudoterminal. @@ -36,25 +36,25 @@ type Terminal struct { // this terminal. This field is immutable. masterKTTY *kernel.TTY - // slaveKTTY contains the controlling process of the slave end of this + // replicaKTTY contains the controlling process of the replica end of this // terminal. This field is immutable. - slaveKTTY *kernel.TTY + replicaKTTY *kernel.TTY } func newTerminal(n uint32) *Terminal { - termios := linux.DefaultSlaveTermios + termios := linux.DefaultReplicaTermios t := Terminal{ - n: n, - ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{Index: n}, - slaveKTTY: &kernel.TTY{Index: n}, + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + replicaKTTY: &kernel.TTY{Index: n}, } return &t } // setControllingTTY makes tm the controlling terminal of the calling thread // group. -func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("setControllingTTY must be called from a task context") @@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a // releaseControllingTTY removes tm as the controlling terminal of the calling // thread group. -func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("releaseControllingTTY must be called from a task context") @@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar } // foregroundProcessGroup gets the process group ID of tm's foreground process. -func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("foregroundProcessGroup must be called from a task context") @@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a } // Write it out to *arg. - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ - AddressSpaceActive: true, - }) + retP := primitive.Int32(ret) + _, err = retP.CopyOut(task, args[2].Pointer()) return 0, err } // foregroundProcessGroup sets tm's foreground process. 
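terminal.go drops its usermem.IO parameters for the same reason as the ioctl paths: the task recovered from the context serves as the copy context, and scalar arguments travel through primitive wrappers (primitive.Int32 above and below for the foreground pgid). A tiny standalone sketch of such a self-marshalling scalar, round-tripping a pgid the way TIOCGPGRP/TIOCSPGRP do:

package main

import (
	"encoding/binary"
	"fmt"
)

// int32Prim mimics primitive.Int32: a named scalar that knows its own
// fixed-size marshalling, so callers need neither reflection nor a separate
// usermem.IO handle.
type int32Prim int32

func (i *int32Prim) copyIn(mem []byte) {
	*i = int32Prim(binary.LittleEndian.Uint32(mem))
}

func (i int32Prim) copyOut(mem []byte) {
	binary.LittleEndian.PutUint32(mem, uint32(i))
}

func main() {
	userWord := make([]byte, 4)
	int32Prim(1234).copyOut(userWord) // e.g. TIOCGPGRP writing the pgid

	var pgid int32Prim
	pgid.copyIn(userWord) // e.g. TIOCSPGRP reading the requested pgid
	fmt.Println(pgid)     // 1234
}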
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("setForegroundProcessGroup must be called from a task context") } // Read in the process group ID. - var pgid int32 - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgid primitive.Int32 + if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } @@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { if isMaster { return tm.masterKTTY } - return tm.slaveKTTY + return tm.replicaKTTY } diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD index aa0c2ad8c..01bbee5ad 100644 --- a/pkg/sentry/fsimpl/devtmpfs/BUILD +++ b/pkg/sentry/fsimpl/devtmpfs/BUILD @@ -24,6 +24,7 @@ go_test( library = ":devtmpfs", deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/tmpfs", diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 2ed5fa8a9..e6fe0fc0d 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -18,6 +18,7 @@ package devtmpfs import ( "fmt" + "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -32,8 +33,10 @@ import ( const Name = "devtmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct { - initOnce sync.Once + initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported. initErr error // fs is the tmpfs filesystem that backs all mounts of this FilesystemType. @@ -68,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua return fst.fs, fst.root, nil } +// Release implements vfs.FilesystemType.Release. +func (fst *FilesystemType) Release(ctx context.Context) { + if fst.fs != nil { + // Release the original reference obtained when creating the filesystem. + fst.root.DecRef(ctx) + fst.fs.DecRef(ctx) + } +} + // Accessor allows devices to create device special files in devtmpfs. type Accessor struct { vfsObj *vfs.VirtualFilesystem @@ -79,14 +91,17 @@ type Accessor struct { // NewAccessor returns an Accessor that supports creation of device special // files in the devtmpfs instance registered with name fsTypeName in vfsObj. func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{}) if err != nil { return nil, err } + // Pass a reference on root to the Accessor. + root := mntns.Root() + root.IncRef() return &Accessor{ vfsObj: vfsObj, mntns: mntns, - root: mntns.Root(), + root: root, creds: creds, }, nil } @@ -150,13 +165,11 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v // Create any parent directories. See // devtmpfs.c:handle_create()=>path_create(). 
- for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() { - pop := a.pathOperationAt(it.String()) - if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{ - Mode: 0755, - }); err != nil { - return fmt.Errorf("failed to create directory %q: %v", it.String(), err) - } + parent := path.Dir(pathname) + if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + return fmt.Errorf("failed to create device parent directory %q: %v", parent, err) } // NOTE: Linux's devtmpfs refuses to automatically delete files it didn't diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 747867cca..e058eda7a 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -15,9 +15,11 @@ package devtmpfs import ( + "path" "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" @@ -25,10 +27,13 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -func TestDevtmpfs(t *testing.T) { +const devPath = "/dev" + +func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) { + t.Helper() + ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - vfsObj := &vfs.VirtualFilesystem{} if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) @@ -43,14 +48,12 @@ func TestDevtmpfs(t *testing.T) { }) // Create a test mount namespace with devtmpfs mounted at "/dev". - const devPath = "/dev" - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(ctx) root := mntns.Root() - defer root.DecRef(ctx) + root.IncRef() devpop := vfs.PathOperation{ Root: root, Start: root, @@ -61,10 +64,20 @@ func TestDevtmpfs(t *testing.T) { }); err != nil { t.Fatalf("failed to create mount point: %v", err) } - if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { t.Fatalf("failed to mount devtmpfs: %v", err) } + return ctx, creds, vfsObj, root, func() { + root.DecRef(ctx) + mntns.DecRef(ctx) + } +} + +func TestUserspaceInit(t *testing.T) { + ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t) + defer cleanup() + a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs") if err != nil { t.Fatalf("failed to create devtmpfs.Accessor: %v", err) @@ -75,48 +88,143 @@ func TestDevtmpfs(t *testing.T) { if err := a.UserspaceInit(ctx); err != nil { t.Fatalf("failed to userspace-initialize devtmpfs: %v", err) } + // Created files should be visible in the test mount namespace. 
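CreateDeviceFile above replaces the hand-rolled per-component MkdirAt loop with path.Dir plus a single MkdirAllAt call that creates every missing ancestor. A toy mkdir-all over a map-backed path set, just to illustrate the walk (the real call also takes the devtmpfs root and credentials):

package main

import (
	"fmt"
	"path"
	"strings"
)

// mkdirAll sketches MkdirAllAt: walk the path components and create each
// missing directory, treating an existing one as success.
func mkdirAll(dirs map[string]bool, p string) {
	cur := ""
	for _, c := range strings.Split(p, "/") {
		if c == "" {
			continue
		}
		cur = cur + "/" + c
		if !dirs[cur] {
			dirs[cur] = true // the MkdirAt equivalent
		}
	}
}

func main() {
	dirs := map[string]bool{}
	pathname := "a/b/c/d/e" // one of the test cases above
	mkdirAll(dirs, path.Dir(pathname))
	fmt.Println(len(dirs), dirs["/a/b/c/d"]) // 4 true: parents exist, "e" is the node itself
}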
- abspath := devPath + "/fd" - target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(abspath), - }) - if want := "/proc/self/fd"; err != nil || target != want { - t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want) + links := []struct { + source string + target string + }{ + { + source: "fd", + target: "/proc/self/fd", + }, + { + source: "stdin", + target: "/proc/self/fd/0", + }, + { + source: "stdout", + target: "/proc/self/fd/1", + }, + { + source: "stderr", + target: "/proc/self/fd/2", + }, + { + source: "ptmx", + target: "pts/ptmx", + }, } - // Create a dummy device special file using a devtmpfs.Accessor. - const ( - pathInDev = "dummy" - kind = vfs.CharDevice - major = 12 - minor = 34 - perms = 0600 - wantMode = linux.S_IFCHR | perms - ) - if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil { - t.Fatalf("failed to create device file: %v", err) + for _, link := range links { + abspath := path.Join(devPath, link.source) + if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }); err != nil || gotTarget != link.target { + t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target) + } } - // The device special file should be visible in the test mount namespace. - abspath = devPath + "/" + pathInDev - stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(abspath), - }, &vfs.StatOptions{ - Mask: linux.STATX_TYPE | linux.STATX_MODE, - }) - if err != nil { - t.Fatalf("failed to stat device file at %q: %v", abspath, err) + + dirs := []string{"shm", "pts"} + for _, dir := range dirs { + abspath := path.Join(devPath, dir) + statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }, &vfs.StatOptions{ + Mask: linux.STATX_MODE, + }) + if err != nil { + t.Errorf("stat(%q): got error %v ", abspath, err) + continue + } + if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want { + t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want) + } } - if stat.Mode != wantMode { - t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode) +} + +func TestCreateDeviceFile(t *testing.T) { + ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t) + defer cleanup() + + a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs") + if err != nil { + t.Fatalf("failed to create devtmpfs.Accessor: %v", err) } - if stat.RdevMajor != major { - t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major) + defer a.Release(ctx) + + devFiles := []struct { + path string + kind vfs.DeviceKind + major uint32 + minor uint32 + perms uint16 + }{ + { + path: "dummy", + kind: vfs.CharDevice, + major: 12, + minor: 34, + perms: 0600, + }, + { + path: "foo/bar", + kind: vfs.BlockDevice, + major: 13, + minor: 35, + perms: 0660, + }, + { + path: "foo/baz", + kind: vfs.CharDevice, + major: 12, + minor: 40, + perms: 0666, + }, + { + path: "a/b/c/d/e", + kind: vfs.BlockDevice, + major: 12, + minor: 34, + perms: 0600, + }, } - if stat.RdevMinor != minor { - t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor) + + for _, f := range devFiles { + if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil { + t.Fatalf("failed to create device file: %v", err) + } + // The device special file should be visible 
in the test mount namespace. + abspath := path.Join(devPath, f.path) + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE | linux.STATX_MODE, + }) + if err != nil { + t.Fatalf("failed to stat device file at %q: %v", abspath, err) + } + if stat.RdevMajor != f.major { + t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major) + } + if stat.RdevMinor != f.minor { + t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor) + } + wantMode := f.perms + switch f.kind { + case vfs.CharDevice: + wantMode |= linux.S_IFCHR + case vfs.BlockDevice: + wantMode |= linux.S_IFBLK + } + if stat.Mode != wantMode { + t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode) + } } } diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index 812171fa3..1c27ad700 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -30,9 +30,11 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// EventFileDescription implements FileDescriptionImpl for file-based event +// EventFileDescription implements vfs.FileDescriptionImpl for file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in // certain situations they may be converted into a host-backed eventfd. +// +// +stateify savable type EventFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -106,7 +108,7 @@ func (efd *EventFileDescription) HostFD() (int, error) { return efd.hostfd, nil } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() @@ -119,7 +121,7 @@ func (efd *EventFileDescription) Release(context.Context) { } } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { if dst.NumBytes() < 8 { return 0, syscall.EINVAL @@ -130,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc return 8, nil } -// Write implements FileDescriptionImpl.Write. +// Write implements vfs.FileDescriptionImpl.Write. 
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { if src.NumBytes() < 8 { return 0, syscall.EINVAL diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index abc610ef3..7b1eec3da 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -51,6 +51,8 @@ go_library( "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", @@ -86,9 +88,9 @@ go_test( library = ":ext", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fspath", + "//pkg/marshal/primitive", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 8f7d5a9bb..2ee7cc7ac 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -59,13 +59,18 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err } root := mntns.Root() + root.IncRef() tearDown := func() { root.DecRef(ctx) @@ -90,7 +95,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalData: int(f.Fd()), }, diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index 8bb104ff0..1165234f9 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -18,7 +18,7 @@ import ( "io" "math" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/syserror" ) @@ -34,19 +34,19 @@ type blockMapFile struct { // directBlks are the direct blocks numbers. The physical blocks pointed by // these holds file data. Contains file blocks 0 to 11. - directBlks [numDirectBlks]uint32 + directBlks [numDirectBlks]primitive.Uint32 // indirectBlk is the physical block which contains (blkSize/4) direct block // numbers (as uint32 integers). - indirectBlk uint32 + indirectBlk primitive.Uint32 // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect // block numbers (as uint32 integers). - doubleIndirectBlk uint32 + doubleIndirectBlk primitive.Uint32 // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly // indirect block numbers (as uint32 integers). - tripleIndirectBlk uint32 + tripleIndirectBlk primitive.Uint32 // coverage at (i)th index indicates the amount of file data a node at // height (i) covers. Height 0 is the direct block. 
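The coverage comment above encodes the ext block-map geometry: a height-0 (direct) block covers blkSize bytes of file data, and each extra level of indirection multiplies that by blkSize/4, since every 4-byte slot names one child block. A standalone sketch of that arithmetic, which is what ReadAt's dirBlksEnd/indirBlkEnd/doubIndirBlkEnd boundaries are built from:

package main

import "fmt"

// coverage returns how many bytes of file data a block-map node at the given
// height covers: blkSize for a direct block, multiplied by blkSize/4 per
// level of indirection (one 4-byte block number per slot).
func coverage(blkSize uint64, height uint) uint64 {
	c := blkSize
	for i := uint(0); i < height; i++ {
		c *= blkSize / 4
	}
	return c
}

func main() {
	const blkSize = 1024
	for h := uint(0); h <= 3; h++ {
		fmt.Printf("height %d covers %d bytes\n", h, coverage(blkSize, h))
	}
	// With the 12 direct blocks (file blocks 0 to 11) this reproduces the
	// end-of-direct-blocks boundary used when dispatching a read offset.
	fmt.Println(12*coverage(blkSize, 0), "=> end of direct blocks")
}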
@@ -68,10 +68,12 @@ func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { } blkMap := file.regFile.inode.diskInode.Data() - binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks) - binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk) + for i := 0; i < numDirectBlks; i++ { + file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4]) + } + file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4]) + file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4]) + file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4]) return file, nil } @@ -117,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { switch { case offset < dirBlksEnd: // Direct block. - curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:]) + curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:]) case offset < indirBlkEnd: // Indirect block. - curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:]) + curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:]) case offset < doubIndirBlkEnd: // Doubly indirect block. - curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:]) + curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:]) default: // Triply indirect block. - curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:]) + curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:]) } read += curR @@ -174,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds read := 0 curChildOff := relFileOff % childCov for i := startIdx; i < endIdx; i++ { - var childPhyBlk uint32 + var childPhyBlk primitive.Uint32 err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) if err != nil { return read, err } - n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:]) + n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:]) read += n if err != nil { return read, err diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 6fa84e7aa..ed98b482e 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -20,7 +20,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -87,29 +87,33 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) var fileData []byte blkNums := newBlkNumGen() - var data []byte + off := 0 + data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes()) // Write the direct blocks. for i := 0; i < numDirectBlks; i++ { - curBlkNum := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, curBlkNum) - fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...) 
+ curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(data[off:]) + off += curBlkNum.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...) } // Write to indirect block. - indirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, indirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...) - - // Write to indirect block. - doublyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...) - - // Write to indirect block. - triplyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...) + indirectBlk := primitive.Uint32(blkNums.next()) + indirectBlk.MarshalBytes(data[off:]) + off += indirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...) + + // Write to double indirect block. + doublyIndirectBlk := primitive.Uint32(blkNums.next()) + doublyIndirectBlk.MarshalBytes(data[off:]) + off += doublyIndirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...) + + // Write to triple indirect block. + triplyIndirectBlk := primitive.Uint32(blkNums.next()) + triplyIndirectBlk.MarshalBytes(data[off:]) + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...) args := inodeArgs{ fs: &filesystem{ @@ -142,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN var fileData []byte for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { - curBlkNum := blkNums.next() - copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum)) - fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(disk[off : off+4]) + fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...) } return fileData } diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 7a1b4219f..9bfed883a 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -20,6 +20,8 @@ import ( ) // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 0fc01668d..0ad79b381 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -16,7 +16,6 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -28,6 +27,8 @@ import ( ) // directory represents a directory inode. It holds the childList in memory. 
+// +// +stateify savable type directory struct { inode inode @@ -39,7 +40,7 @@ type directory struct { // Lock Order (outermost locks must be taken first): // directory.mu // filesystem.mu - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // childList is a list containing (1) child dirents and (2) fake dirents // (with diskDirent == nil) that represent the iteration position of @@ -98,7 +99,7 @@ func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { } else { curDirent.diskDirent = &disklayout.DirentOld{} } - binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + curDirent.diskDirent.UnmarshalBytes(buf) if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { // Inode number and name length fields being set to 0 is used to indicate @@ -120,6 +121,8 @@ func (i *inode) isDir() bool { } // dirent is the directory.childList node. +// +// +stateify savable type dirent struct { diskDirent disklayout.Dirent @@ -129,6 +132,8 @@ type dirent struct { // directoryFD represents a directory file description. It implements // vfs.FileDescriptionImpl. +// +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD index 9bd9c76c0..d98a05dd8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -22,10 +22,11 @@ go_library( "superblock_old.go", "test_utils.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", + "//pkg/marshal", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go index ad6f4fef8..0d56ae9da 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // BlockGroup represents a Linux ext block group descriptor. An ext file system // is split into a series of block groups. This provides an access layer to // information needed to access and use a block group. @@ -30,6 +34,8 @@ package disklayout // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { + marshal.Marshallable + // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go index 3e16c76db..a35fa22a0 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go @@ -17,6 +17,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and // 32-bit ext4 filesystems. It implements BlockGroup interface. 
+// +// +marshal type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go index 9a809197a..d54d1d345 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go @@ -18,6 +18,8 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. +// +// +marshal type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go index 0ef4294c0..e4ce484e4 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go @@ -21,6 +21,8 @@ import ( // TestBlockGroupSize tests that the block group descriptor structs are of the // correct size. func TestBlockGroupSize(t *testing.T) { - assertSize(t, BlockGroup32Bit{}, 32) - assertSize(t, BlockGroup64Bit{}, 64) + var bgSmall BlockGroup32Bit + assertSize(t, &bgSmall, 32) + var bgBig BlockGroup64Bit + assertSize(t, &bgBig, 64) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go index 417b6cf65..568c8cb4c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go @@ -15,6 +15,7 @@ package disklayout import ( + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fs" ) @@ -51,6 +52,8 @@ var ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. type Dirent interface { + marshal.Marshallable + // Inode returns the absolute inode number of the underlying inode. // Inode number 0 signifies an unused dirent. Inode() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go index 29ae4a5c2..51f9c2946 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go @@ -29,12 +29,14 @@ import ( // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentNew struct { InodeNumber uint32 RecordLength uint16 NameLength uint8 FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentNew implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go index 6fff12a6e..d4b19e086 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go @@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs" // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentOld struct { InodeNumber uint32 RecordLength uint16 NameLength uint16 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentOld implements Dirent. 
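Both dirent encodings declare FileNameRaw at its maximum on-disk size and tag it `marshal:"unaligned"`, since records are packed back to back and only NameLength bytes of the name are valid. A standalone sketch of decoding one DirentOld-shaped record by hand (the bytes are made up for illustration):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// One packed record, shaped like DirentOld: inode (u32), record
	// length (u16), name length (u16), then the name bytes plus padding.
	rec := []byte{
		12, 0, 0, 0, // InodeNumber = 12
		16, 0, // RecordLength = 16
		5, 0, // NameLength = 5
		'h', 'e', 'l', 'l', 'o', 0, 0, 0, // "hello" + padding
	}
	inode := binary.LittleEndian.Uint32(rec[0:4])
	recLen := binary.LittleEndian.Uint16(rec[4:6])
	nameLen := binary.LittleEndian.Uint16(rec[6:8])
	// Bytes past NameLength may be garbage; slice the name explicitly.
	name := string(rec[8 : 8+int(nameLen)])
	fmt.Println(inode, recLen, name) // 12 16 hello
}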
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go index 934919f8a..3486864dc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go @@ -21,6 +21,8 @@ import ( // TestDirentSize tests that the dirent structs are of the correct // size. func TestDirentSize(t *testing.T) { - assertSize(t, DirentOld{}, uintptr(DirentSize)) - assertSize(t, DirentNew{}, uintptr(DirentSize)) + var dOld DirentOld + assertSize(t, &dOld, DirentSize) + var dNew DirentNew + assertSize(t, &dNew, DirentSize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go index bdf4e2132..0834e9ba8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go +++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go @@ -36,8 +36,6 @@ // escape analysis on an unknown implementation at compile time. // // Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. // - All structures on disk are in little-endian order. Only jbd2 (journal) // structures are in big-endian order. // - All OS dependent fields in these structures will be interpretted using diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go index 4110649ab..b13999bfc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // Extents were introduced in ext4 and provide huge performance gains in terms // data locality and reduced metadata block usage. Extents are organized in // extent trees. The root node is contained in inode.BlocksRaw. @@ -64,6 +68,8 @@ type ExtentNode struct { // ExtentEntry represents an extent tree node entry. The entry can either be // an ExtentIdx or Extent itself. This exists to simplify navigation logic. type ExtentEntry interface { + marshal.Marshallable + // FileBlock returns the first file block number covered by this entry. FileBlock() uint32 @@ -75,6 +81,8 @@ type ExtentEntry interface { // tree node begins with this and is followed by `NumEntries` number of: // - Extent if `Depth` == 0 // - ExtentIdx otherwise +// +// +marshal type ExtentHeader struct { // Magic in the extent magic number, must be 0xf30a. Magic uint16 @@ -96,6 +104,8 @@ type ExtentHeader struct { // internal nodes. Sorted in ascending order based on FirstFileBlock since // Linux does a binary search on this. This points to a block containing the // child node. +// +// +marshal type ExtentIdx struct { FirstFileBlock uint32 ChildBlockLo uint32 @@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 { // nodes. Sorted in ascending order based on FirstFileBlock since Linux does a // binary search on this. This points to an array of data blocks containing the // file data. It covers `Length` data blocks starting from `StartBlock`. +// +// +marshal type Extent struct { FirstFileBlock uint32 Length uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go index 8762b90db..c96002e19 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go @@ -21,7 +21,10 @@ import ( // TestExtentSize tests that the extent structs are of the correct // size. 
func TestExtentSize(t *testing.T) { - assertSize(t, ExtentHeader{}, ExtentHeaderSize) - assertSize(t, ExtentIdx{}, ExtentEntrySize) - assertSize(t, Extent{}, ExtentEntrySize) + var h ExtentHeader + assertSize(t, &h, ExtentHeaderSize) + var i ExtentIdx + assertSize(t, &i, ExtentEntrySize) + var e Extent + assertSize(t, &e, ExtentEntrySize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go index 88ae913f5..ef25040a9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go @@ -16,6 +16,7 @@ package disklayout import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) @@ -38,6 +39,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. type Inode interface { + marshal.Marshallable + // Mode returns the linux file mode which is majorly used to extract // information like: // - File permissions (read/write/execute by user/group/others). diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go index 8f9f574ce..a4503f5cf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go @@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time" // are used to provide nanoscond precision. Hence, these timestamps will now // overflow in May 2446. // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +// +// +marshal type InodeNew struct { InodeOld diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go index db25b11b6..e6b28babf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go @@ -30,6 +30,8 @@ const ( // // All fields representing time are in seconds since the epoch. Which means that // they will overflow in January 2038. +// +// +marshal type InodeOld struct { ModeRaw uint16 UIDLo uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go index dd03ee50e..90744e956 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go @@ -24,10 +24,12 @@ import ( // TestInodeSize tests that the inode structs are of the correct size. func TestInodeSize(t *testing.T) { - assertSize(t, InodeOld{}, OldInodeSize) + var iOld InodeOld + assertSize(t, &iOld, OldInodeSize) // This was updated from 156 bytes to 160 bytes in Oct 2015. - assertSize(t, InodeNew{}, 160) + var iNew InodeNew + assertSize(t, &iNew, 160) } // TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go index 8bb327006..70948ebe9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + const ( // SbOffset is the absolute offset at which the superblock is placed. SbOffset = 1024 @@ -38,6 +42,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. type SuperBlock interface { + marshal.Marshallable + // InodesCount returns the total number of inodes in this filesystem. 
InodesCount() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go index 53e515fd3..4dc6080fb 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go @@ -17,6 +17,8 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if // RevLevel = DynamicRev and 64-bit feature is disabled. +// +// +marshal type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go index 7c1053fb4..2c9039327 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go @@ -19,6 +19,8 @@ package disklayout // 1024 bytes (smallest possible block size) and hence the superblock always // fits in no more than one data block. Should only be used when the 64-bit // feature is set. +// +// +marshal type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go index 9221e0251..e4709f23c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go @@ -16,6 +16,8 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the // superblock struct. Should be used only if RevLevel = OldRev. +// +// +marshal type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go index 463b5ba21..b734b6987 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go @@ -21,7 +21,10 @@ import ( // TestSuperBlockSize tests that the superblock structs are of the correct // size. func TestSuperBlockSize(t *testing.T) { - assertSize(t, SuperBlockOld{}, 84) - assertSize(t, SuperBlock32Bit{}, 336) - assertSize(t, SuperBlock64Bit{}, 1024) + var sbOld SuperBlockOld + assertSize(t, &sbOld, 84) + var sb32 SuperBlock32Bit + assertSize(t, &sb32, 336) + var sb64 SuperBlock64Bit + assertSize(t, &sb64, 1024) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go index 9c63f04c0..a4bc08411 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go +++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go @@ -18,13 +18,13 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" ) -func assertSize(t *testing.T, v interface{}, want uintptr) { +func assertSize(t *testing.T, v marshal.Marshallable, want int) { t.Helper() - if got := binary.Size(v); got != want { + if got := v.SizeBytes(); got != want { t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) } } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 08ffc2834..38fb7962b 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -34,11 +34,10 @@ import ( const Name = "ext" // FilesystemType implements vfs.FilesystemType. 
+// +// +stateify savable type FilesystemType struct{} -// Compiles only if FilesystemType implements vfs.FilesystemType. -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // getDeviceFd returns an io.ReaderAt to the underlying device. // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. @@ -99,6 +98,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 2dbaee287..d9fd4590c 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -71,13 +71,18 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err } root := mntns.Root() + root.IncRef() tearDown := func() { root.DecRef(ctx) diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index c36225a7c..778460107 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -18,12 +18,13 @@ import ( "io" "sort" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // extentFile is a type of regular file which uses extents to store file data. +// +// +stateify savable type extentFile struct { regFile regularFile @@ -58,7 +59,7 @@ func newExtentFile(args inodeArgs) (*extentFile, error) { func (f *extentFile) buildExtTree() error { rootNodeData := f.regFile.inode.diskInode.Data() - binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header) + f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize]) // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. if f.root.Header.NumEntries > 4 { @@ -77,7 +78,7 @@ func (f *extentFile) buildExtTree() error { // Internal node. 
curEntry = &disklayout.ExtentIdx{} } - binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry) + curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize]) f.root.Entries[i].Entry = curEntry } diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index cd10d46ee..985f76ac0 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -21,7 +21,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -202,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] // writeTree writes the tree represented by `root` to the inode and disk. It // also writes random file data on disk. func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { - rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) + rootData := in.diskInode.Data() + root.Header.MarshalBytes(rootData) + off := root.Header.SizeBytes() for _, ep := range root.Entries { - rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(rootData[off:]) + off += ep.Entry.SizeBytes() } - copy(in.diskInode.Data(), rootData) - var fileData []byte for _, ep := range root.Entries { if root.Header.Height == 0 { @@ -223,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl // writeTreeToDisk is the recursive step for writeTree which writes the tree // on the disk only. Also writes random file data on disk. func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { - nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) + nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:] + curNode.Node.Header.MarshalBytes(nodeData) + off := curNode.Node.Header.SizeBytes() for _, ep := range curNode.Node.Entries { - nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(nodeData[off:]) + off += ep.Entry.SizeBytes() } - copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData) - var fileData []byte for _, ep := range curNode.Node.Entries { if curNode.Node.Header.Height == 0 { diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index c714ddf73..917f1873d 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -38,11 +38,13 @@ var ( ) // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // dev represents the underlying fs device. It does not require protection // because io.ReaderAt permits concurrent read calls to it. It translates to @@ -81,9 +83,9 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: -// - filesystem.mu must be locked (for writing if write param is true). -// - !rp.Done(). -// - inode == vfsd.Impl().(*Dentry).inode. +// * filesystem.mu must be locked (for writing if write param is true). +// * !rp.Done(). +// * inode == vfsd.Impl().(*Dentry).inode. 
func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR @@ -166,7 +168,7 @@ func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, in // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: -// - filesystem.mu must be locked (for writing if write param is true). +// * filesystem.mu must be locked (for writing if write param is true). func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode @@ -194,8 +196,8 @@ func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.De // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). // // Preconditions: -// - filesystem.mu must be locked (for writing if write param is true). -// - !rp.Done(). +// * filesystem.mu must be locked (for writing if write param is true). +// * !rp.Done(). func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode @@ -490,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { _, inode, err := fs.walk(ctx, rp, false) if err != nil { @@ -504,8 +506,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { _, _, err := fs.walk(ctx, rp, false) if err != nil { return nil, err @@ -513,8 +515,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { _, _, err := fs.walk(ctx, rp, false) if err != nil { return "", err @@ -522,8 +524,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { _, _, err := fs.walk(ctx, rp, false) if err != nil { return err @@ -531,8 +533,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. 
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 _, _, err := fs.walk(ctx, rp, false)
 if err != nil {
 return err
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 30636cf66..9009ba3c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -37,6 +37,8 @@ import (
 // |-- regular--
 // |-- extent file
 // |-- block map file
+//
+// +stateify savable
 type inode struct {
 // refs is a reference count. refs is accessed using atomic memory operations.
 refs int64
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index e73e740d6..4a5539b37 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -31,6 +31,8 @@ import (
 // regularFile represents a regular file's inode. This too follows the
 // inheritance pattern prevalent in the vfs layer described in
 // pkg/sentry/vfs/README.md.
+//
+// +stateify savable
 type regularFile struct {
 inode inode
@@ -67,6 +69,8 @@ func (in *inode) isRegular() bool {
 // regularFileFD represents a regular file's file description. It implements
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type regularFileFD struct {
 fileDescription
 vfs.LockFD
@@ -75,7 +79,7 @@ type regularFileFD struct {
 off int64
 // offMu serializes operations that may mutate off.
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
 }
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 2fd0d1fa8..5e2bcc837 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -23,6 +23,8 @@ import (
 )
 // symlink represents a symlink inode.
+//
+// +stateify savable
 type symlink struct {
 inode inode
 target string // immutable
@@ -61,9 +63,11 @@ func (in *inode) isSymlink() bool {
 return ok
 }
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
 // vfs.FileDescriptionImpl, which may only be used if open options contains
 // O_PATH. For this reason most of the functions return EBADF.
+//
+// +stateify savable
 type symlinkFD struct {
 fileDescription
 vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index d8b728f8c..58ef7b9b8 100644
--- a/pkg/sentry/fsimpl/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -17,21 +17,21 @@ package ext
 import (
 "io"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
 "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 "gvisor.dev/gvisor/pkg/syserror"
 )
 // readFromDisk performs a binary read from disk into the given struct from
 // the absolute offset provided.
-func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { - n := binary.Size(v) +func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error { + n := v.SizeBytes() buf := make([]byte, n) if read, _ := dev.ReadAt(buf, abOff); read < int(n) { return syserror.EIO } - binary.Unmarshal(buf, binary.LittleEndian, v) + v.UnmarshalBytes(buf) return nil } diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 999111deb..045d7ab08 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -15,21 +15,41 @@ go_template_instance( }, ) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "fuse", + prefix = "inode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "inode", + }, +) + go_library( name = "fuse", srcs = [ "connection.go", + "connection_control.go", "dev.go", + "directory.go", + "file.go", "fusefs.go", - "init.go", + "inode_refs.go", + "read_write.go", "register.go", + "regular_file.go", "request_list.go", + "request_response.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/marshal", + "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", @@ -39,7 +59,6 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -47,10 +66,15 @@ go_library( go_test( name = "fuse_test", size = "small", - srcs = ["dev_test.go"], + srcs = [ + "connection_test.go", + "dev_test.go", + "utils_test.go", + ], library = ":fuse", deps = [ "//pkg/abi/linux", + "//pkg/marshal", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -58,6 +82,5 @@ go_test( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index 6df2728ab..8ccda1264 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -15,31 +15,17 @@ package fuse import ( - "errors" - "fmt" "sync" - "sync/atomic" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) -// maxActiveRequestsDefault is the default setting controlling the upper bound -// on the number of active requests at any given time. -const maxActiveRequestsDefault = 10000 - -// Ordinary requests have even IDs, while interrupts IDs are odd. -// Used to increment the unique ID for each FUSE request. -var reqIDStep uint64 = 2 - const ( // fuseDefaultMaxBackground is the default value for MaxBackground. fuseDefaultMaxBackground = 12 @@ -52,43 +38,39 @@ const ( fuseDefaultMaxPagesPerReq = 32 ) -// Request represents a FUSE operation request that hasn't been sent to the -// server yet. +// connection is the struct by which the sentry communicates with the FUSE server daemon. // -// +stateify savable -type Request struct { - requestEntry - - id linux.FUSEOpID - hdr *linux.FUSEHeaderIn - data []byte -} - -// Response represents an actual response from the server, including the -// response payload. 
+// Lock order:
+// - conn.fd.mu
+// - conn.mu
+// - conn.asyncMu
 //
 // +stateify savable
-type Response struct {
- opcode linux.FUSEOpcode
- hdr linux.FUSEHeaderOut
- data []byte
-}
-
-// connection is the struct by which the sentry communicates with the FUSE server daemon.
 type connection struct {
 fd *DeviceFD
+ // mu protects access to struct members.
+ mu sync.Mutex `state:"nosave"`
+
+ // attributeVersion is the version of connection's attributes.
+ attributeVersion uint64
+
+ // We target FUSE 7.23.
 // The following FUSE_INIT flags are currently unsupported by this implementation:
- // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
 // - FUSE_EXPORT_SUPPORT
- // - FUSE_HANDLE_KILLPRIV
 // - FUSE_POSIX_LOCKS: requires POSIX locks
 // - FUSE_FLOCK_LOCKS: requires POSIX locks
 // - FUSE_AUTO_INVAL_DATA: requires page caching eviction
- // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
 // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
 // - FUSE_ASYNC_DIO
- // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+ // - FUSE_PARALLEL_DIROPS (7.25)
+ // - FUSE_HANDLE_KILLPRIV (7.26)
+ // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
+ // - FUSE_ABORT_ERROR (7.27)
+ // - FUSE_CACHE_SYMLINKS (7.28)
+ // - FUSE_NO_OPENDIR_SUPPORT (7.29)
+ // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
+ // - FUSE_MAP_ALIGNMENT (7.31)
 // initialized after receiving FUSE_INIT reply.
 // Until it's set, suspend sending FUSE requests.
@@ -96,11 +78,7 @@ type connection struct {
 initialized int32
 // initializedChan is used to block requests before initialization.
- initializedChan chan struct{}
-
- // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
- // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
- blocked bool
+ initializedChan chan struct{} `state:".(bool)"`
 // connected (connection established) when a new FUSE file system is created.
 // Set to false when:
@@ -109,48 +87,55 @@ type connection struct {
 // device release.
 connected bool
- // aborted via sysfs.
- // TODO(gvisor.dev/issue/3185): abort all queued requests.
- aborted bool
-
 // connInitError if FUSE_INIT encountered error (major version mismatch).
 // Only set in INIT.
 connInitError bool
 // connInitSuccess if FUSE_INIT is successful.
 // Only set in INIT.
- // Used for destory.
+ // Used for destroy (not yet implemented).
 connInitSuccess bool
- // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
-
- // NumberBackground is the number of requests in the background.
- numBackground uint16
+ // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV).
+ // Set only if abortErr is true and via fuse control fs (not yet implemented).
+ // TODO(gvisor.dev/issue/3525): set this to true when user aborts.
+ aborted bool
- // congestionThreshold for NumBackground.
- // Negotiated in FUSE_INIT.
- congestionThreshold uint16
+ // numWaiting is the number of requests waiting to be
+ // sent to FUSE device or being processed by FUSE daemon.
+ numWaiting uint32
- // maxBackground is the maximum number of NumBackground.
- // Block connection when it is reached.
- // Negotiated in FUSE_INIT.
- maxBackground uint16
+ // Terminology note:
+ //
+ // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
+ //
+ // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
+ //
+ // We refer to the requests that Unix calls "background" requests as async requests.
+ // Unix's "async requests" correspond to our async requests that expect a reply,
+ // i.e. `!request.noReply`.
- // numActiveBackground is the number of requests in background and has being marked as active.
- numActiveBackground uint16
+ // asyncMu protects the async request fields.
+ asyncMu sync.Mutex `state:"nosave"`
- // numWating is the number of requests waiting for completion.
- numWaiting uint32
+ // asyncNum is the number of async requests.
+ // Protected by asyncMu.
+ asyncNum uint16
- // TODO(gvisor.dev/issue/3185): BgQueue
- // some queue for background queued requests.
+ // asyncCongestionThreshold is the congestion threshold for the number of async requests.
+ // Negotiated in FUSE_INIT as "CongestionThreshold".
+ // TODO(gvisor.dev/issue/3529): add congestion control.
+ // Protected by asyncMu.
+ asyncCongestionThreshold uint16
- // bgLock protects:
- // MaxBackground, CongestionThreshold, NumBackground,
- // NumActiveBackground, BgQueue, Blocked.
- bgLock sync.Mutex
+ // asyncNumMax is the maximum value of asyncNum.
+ // The connection blocks async requests when it is reached.
+ // Negotiated in FUSE_INIT as "MaxBackground".
+ // Protected by asyncMu.
+ asyncNumMax uint16
 // maxRead is the maximum size of a read buffer in bytes.
+ // Initialized from a fuse fs parameter.
 maxRead uint32
 // maxWrite is the maximum size of a write buffer in bytes.
@@ -165,23 +150,20 @@ type connection struct {
 // Negotiated and only set in INIT.
 minor uint32
- // asyncRead if read pages asynchronously.
+ // atomicOTrunc is true when FUSE does not send a separate SETATTR request
+ // before open with O_TRUNC flag.
 // Negotiated and only set in INIT.
- asyncRead bool
+ atomicOTrunc bool
- // abortErr is true if kernel need to return an unique read error after abort.
+ // asyncRead if read pages asynchronously.
 // Negotiated and only set in INIT.
- abortErr bool
+ asyncRead bool
 // writebackCache is true for write-back cache policy,
 // false for write-through policy.
 // Negotiated and only set in INIT.
 writebackCache bool
- // cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
- // Negotiated and only set in INIT.
- cacheSymlinks bool
-
 // bigWrites if doing multi-page cached writes.
 // Negotiated and only set in INIT.
 bigWrites bool
@@ -189,116 +171,86 @@ type connection struct {
 // dontMask if filesystem does not apply umask to creation modes.
 // Negotiated in INIT.
 dontMask bool
+
+ // noOpen if FUSE server doesn't support open operation.
+ // This flag only influences performance, not correctness of the program.
+ noOpen bool
+}
+
+func (conn *connection) saveInitializedChan() bool {
+ select {
+ case <-conn.initializedChan:
+ return true // Closed.
+ default:
+ return false // Not closed.
+ }
+}
+
+func (conn *connection) loadInitializedChan(closed bool) {
+ conn.initializedChan = make(chan struct{}, 1)
+ if closed {
+ close(conn.initializedChan)
+ }
 }
 // newFUSEConnection creates a FUSE connection to fd.
-func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
 // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
 // mount a FUSE filesystem.
fuseFD := fd.Impl().(*DeviceFD) - fuseFD.mounted = true // Create the writeBuf for the header to be stored in. hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) fuseFD.writeBuf = make([]byte, hdrLen) fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) - fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) + fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests) fuseFD.writeCursor = 0 return &connection{ - fd: fuseFD, - maxBackground: fuseDefaultMaxBackground, - congestionThreshold: fuseDefaultCongestionThreshold, - maxPages: fuseDefaultMaxPagesPerReq, - initializedChan: make(chan struct{}), - connected: true, - }, nil -} - -// SetInitialized atomically sets the connection as initialized. -func (conn *connection) SetInitialized() { - // Unblock the requests sent before INIT. - close(conn.initializedChan) - - // Close the channel first to avoid the non-atomic situation - // where conn.initialized is true but there are - // tasks being blocked on the channel. - // And it prevents the newer tasks from gaining - // unnecessary higher chance to be issued before the blocked one. - - atomic.StoreInt32(&(conn.initialized), int32(1)) -} - -// IsInitialized atomically check if the connection is initialized. -// pairs with SetInitialized(). -func (conn *connection) Initialized() bool { - return atomic.LoadInt32(&(conn.initialized)) != 0 -} - -// NewRequest creates a new request that can be sent to the FUSE server. -func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { - conn.fd.mu.Lock() - defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) - - hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() - hdr := linux.FUSEHeaderIn{ - Len: uint32(hdrLen + payload.SizeBytes()), - Opcode: opcode, - Unique: conn.fd.nextOpID, - NodeID: ino, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - PID: pid, - } - - buf := make([]byte, hdr.Len) - hdr.MarshalUnsafe(buf[:hdrLen]) - payload.MarshalUnsafe(buf[hdrLen:]) - - return &Request{ - id: hdr.Unique, - hdr: &hdr, - data: buf, + fd: fuseFD, + asyncNumMax: fuseDefaultMaxBackground, + asyncCongestionThreshold: fuseDefaultCongestionThreshold, + maxRead: opts.maxRead, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } -// Call makes a request to the server and blocks the invoking task until a -// server responds with a response. Task should never be nil. -// Requests will not be sent before the connection is initialized. -// For async tasks, use CallAsync(). -func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { - // Block requests sent before connection is initalized. - if !conn.Initialized() { - if err := t.Block(conn.initializedChan); err != nil { - return nil, err - } - } - - return conn.call(t, r) +// CallAsync makes an async (aka background) request. +// It's a simple wrapper around Call(). +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + r.async = true + _, err := conn.Call(t, r) + return err } -// CallAsync makes an async (aka background) request. -// Those requests either do not expect a response (e.g. release) or -// the response should be handled by others (e.g. init). -// Return immediately unless the connection is blocked (before initialization). -// Async call example: init, release, forget, aio, interrupt. +// Call makes a request to the server. 
+// Block before the connection is initialized.
 // When the Request is FUSE_INIT, it will not be blocked before initialization.
-func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+// Task should never be nil.
+//
+// For a sync request, it blocks the invoking task until
+// a server responds with a response.
+//
+// For an async request (one that does not expect a response immediately),
+// it returns directly unless it is blocked, either before initialization
+// or when there are too many ongoing async requests.
+//
+// Examples of async requests:
+// init, readahead, write, async read/write, fuse_notify_reply,
+// non-sync release, interrupt, forget.
+//
+// The forget request does not have a reply,
+// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
 // Block requests sent before connection is initialized.
 if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
 if err := t.Block(conn.initializedChan); err != nil {
- return err
+ return nil, err
 }
 }
- // This should be the only place that invokes call() with a nil task.
- _, err := conn.call(nil, r)
- return err
-}
-
-// call makes a call without blocking checks.
-func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
 if !conn.connected {
 return nil, syserror.ENOTCONN
 }
@@ -315,31 +267,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
 return fut.resolve(t)
 }
-// Error returns the error of the FUSE call.
-func (r *Response) Error() error {
- errno := r.hdr.Error
- if errno >= 0 {
- return nil
- }
-
- sysErrNo := syscall.Errno(-errno)
- return error(sysErrNo)
-}
-
-// UnmarshalPayload unmarshals the response data into m.
-func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
- hdrLen := r.hdr.SizeBytes()
- haveDataLen := r.hdr.Len - uint32(hdrLen)
- wantDataLen := uint32(m.SizeBytes())
-
- if haveDataLen < wantDataLen {
- return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
- }
-
- m.UnmarshalUnsafe(r.data[hdrLen:])
- return nil
-}
-
 // callFuture makes a request to the server and returns a future response.
 // Call resolve() when the response needs to be fulfilled.
 func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
@@ -358,11 +285,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
 // if there are always too many ongoing requests. The
 // supported maxActiveRequests setting should be really high to avoid this.
 for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
- if t == nil {
- // Since there is no task that is waiting. We must error out.
- return nil, errors.New("FUSE request queue full")
- }
-
 log.Infof("Blocking request %v from being queued. Too many active requests: %v", r.id, conn.fd.numActiveRequests)
 conn.fd.mu.Unlock()
@@ -378,9 +300,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
 // callFutureLocked makes a request to the server and returns a future response.
 func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+ // Check connected again holding conn.mu.
+ conn.mu.Lock()
+ if !conn.connected {
+ conn.mu.Unlock()
+ // We checked connected before;
+ // this must be due to an aborted connection.
+ return nil, syserror.ECONNABORTED
+ }
+ conn.mu.Unlock()
+
 conn.fd.queue.PushBack(r)
- conn.fd.numActiveRequests += 1
- fut := newFutureResponse(r.hdr.Opcode)
+ conn.fd.numActiveRequests++
+ fut := newFutureResponse(r)
 conn.fd.completions[r.id] = fut
 // Signal the readers that there is something to read.
@@ -388,50 +320,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes
 return fut, nil
 }
-
-// futureResponse represents an in-flight request, that may or may not have
-// completed yet. Convert it to a resolved Response by calling Resolve, but note
-// that this may block.
-//
-// +stateify savable
-type futureResponse struct {
- opcode linux.FUSEOpcode
- ch chan struct{}
- hdr *linux.FUSEHeaderOut
- data []byte
-}
-
-// newFutureResponse creates a future response to a FUSE request.
-func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
- return &futureResponse{
- opcode: opcode,
- ch: make(chan struct{}),
- }
-}
-
-// resolve blocks the task until the server responds to its corresponding request,
-// then returns a resolved response.
-func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
- // If there is no Task associated with this request - then we don't try to resolve
- // the response. Instead, the task writing the response (proxy to the server) will
- // process the response on our behalf.
- if t == nil {
- log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
- return nil, nil
- }
-
- if err := t.Block(f.ch); err != nil {
- return nil, err
- }
-
- return f.getResponse(), nil
-}
-
-// getResponse creates a Response from the data the futureResponse has.
-func (f *futureResponse) getResponse() *Response {
- return &Response{
- opcode: f.opcode,
- hdr: *f.hdr,
- data: f.data,
- }
-}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go
index 779c2bd3f..bfde78559 100644
--- a/pkg/sentry/fsimpl/fuse/init.go
+++ b/pkg/sentry/fsimpl/fuse/connection_control.go
@@ -15,7 +15,11 @@
 package fuse
 import (
+ "sync/atomic"
+ "syscall"
+
 "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
 "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
@@ -29,9 +33,10 @@ const (
 // Follow the same behavior as unix fuse implementation.
 fuseMaxTimeGranNs = 1000000000
- // Minimum value for MaxWrite.
+ // Minimum value for MaxWrite and MaxRead.
 // Follow the same behavior as unix fuse implementation.
 fuseMinMaxWrite = 4096
+ fuseMinMaxRead = 4096
 // Temporary default value for max readahead, 128kb.
 fuseDefaultMaxReadahead = 131072
@@ -49,6 +54,26 @@ var (
 MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
 )
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+ // Unblock the requests sent before INIT.
+ close(conn.initializedChan)
+
+ // Close the channel first to avoid the non-atomic situation
+ // where conn.initialized is true but there are
+ // tasks still blocked on the channel.
+ // It also prevents newer tasks from gaining
+ // an unnecessarily higher chance of being issued before the blocked ones.
+
+ atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// Initialized atomically checks if the connection is initialized.
+// Pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+ return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
 // InitSend sends a FUSE_INIT request.
func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
 in := linux.FUSEInitIn{
@@ -70,29 +95,31 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
 }
 // InitRecv receives a FUSE_INIT reply and processes it.
+//
+// Preconditions: conn.asyncMu must not be held if minor version is newer than 13.
 func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
 if err := res.Error(); err != nil {
 return err
 }
- var out linux.FUSEInitOut
- if err := res.UnmarshalPayload(&out); err != nil {
+ initRes := fuseInitRes{initLen: res.DataLen()}
+ if err := res.UnmarshalPayload(&initRes); err != nil {
 return err
 }
- return conn.initProcessReply(&out, hasSysAdminCap)
+ return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
 }
 // Process the FUSE_INIT reply from the FUSE server.
+// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
 func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+ // No matter the error, always set initialized
+ // to unblock the blocked requests.
+ defer conn.SetInitialized()
+
 // No support for old major fuse versions.
 if out.Major != linux.FUSE_KERNEL_VERSION {
 conn.connInitError = true
-
- // Set the connection as initialized and unblock the blocked requests
- // (i.e. return error for them).
- conn.SetInitialized()
-
 return nil
 }
@@ -100,29 +127,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 conn.connInitSuccess = true
 conn.minor = out.Minor
- // No support for limits before minor version 13.
- if out.Minor >= 13 {
- conn.bgLock.Lock()
-
- if out.MaxBackground > 0 {
- conn.maxBackground = out.MaxBackground
-
- if !hasSysAdminCap &&
- conn.maxBackground > MaxUserBackgroundRequest {
- conn.maxBackground = MaxUserBackgroundRequest
- }
- }
-
- if out.CongestionThreshold > 0 {
- conn.congestionThreshold = out.CongestionThreshold
-
- if !hasSysAdminCap &&
- conn.congestionThreshold > MaxUserCongestionThreshold {
- conn.congestionThreshold = MaxUserCongestionThreshold
- }
- }
-
- conn.bgLock.Unlock()
+ // No support for negotiating MaxWrite before minor version 5.
+ if out.Minor >= 5 {
+ conn.maxWrite = out.MaxWrite
+ } else {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+ if conn.maxWrite < fuseMinMaxWrite {
+ conn.maxWrite = fuseMinMaxWrite
 }
 // No support for the following flags before minor version 6.
@@ -131,8 +143,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
 conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
 conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
- conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
- conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
 // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
@@ -148,19 +158,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 }
 }
- // No support for negotiating MaxWrite before minor version 5.
- if out.Minor >= 5 {
- conn.maxWrite = out.MaxWrite
- } else {
- conn.maxWrite = fuseMinMaxWrite
+ // No support for limits before minor version 13.
+ if out.Minor >= 13 {
+ conn.asyncMu.Lock()
+
+ if out.MaxBackground > 0 {
+ conn.asyncNumMax = out.MaxBackground
+
+ if !hasSysAdminCap &&
+ conn.asyncNumMax > MaxUserBackgroundRequest {
+ conn.asyncNumMax = MaxUserBackgroundRequest
+ }
+ }
+
+ if out.CongestionThreshold > 0 {
+ conn.asyncCongestionThreshold = out.CongestionThreshold
+
+ if !hasSysAdminCap &&
+ conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
+ conn.asyncCongestionThreshold = MaxUserCongestionThreshold
+ }
+ }
+
+ conn.asyncMu.Unlock()
 }
- if conn.maxWrite < fuseMinMaxWrite {
- conn.maxWrite = fuseMinMaxWrite
+
+ return nil
+}
+
+// Abort this FUSE connection.
+// It tries to acquire conn.fd.mu, conn.mu, conn.asyncMu in order.
+// All possible requests waiting or blocking will be aborted.
+//
+// Preconditions: conn.fd.mu is locked.
+func (conn *connection) Abort(ctx context.Context) {
+ conn.mu.Lock()
+ conn.asyncMu.Lock()
+
+ if !conn.connected {
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+ conn.fd.mu.Unlock()
+ return
 }
- // Set connection as initialized and unblock the requests
- // issued before init.
- conn.SetInitialized()
+ conn.connected = false
- return nil
+ // Empty the `fd.queue` that holds the requests
+ // not yet read by the FUSE daemon.
+ // These are a subset of the requests in the `fd.completions` map.
+ for !conn.fd.queue.Empty() {
+ req := conn.fd.queue.Front()
+ conn.fd.queue.Remove(req)
+ }
+
+ var terminate []linux.FUSEOpID
+
+ // Collect the requests that have not been sent to the FUSE daemon
+ // or have not received a reply.
+ for unique := range conn.fd.completions {
+ terminate = append(terminate, unique)
+ }
+
+ // Release locks to avoid deadlock.
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+
+ // 1. The requests blocked before initialization.
+ // Will reach Call()'s `connected` check and return.
+ if !conn.Initialized() {
+ conn.SetInitialized()
+ }
+
+ // 2. Terminate the requests collected above.
+ // Set ECONNABORTED error.
+ // sendError() will remove them from the `fd.completions` map.
+ // Will enter the path of a normally received error.
+ for _, toTerminate := range terminate {
+ conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+ }
+
+ // 3. The requests not yet written to the FUSE device.
+ // Terminate early.
+ // Will reach callFutureLocked()'s `connected` check and return.
+ close(conn.fd.fullQueueCh)
+
+ // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
 }
diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go
new file mode 100644
index 000000000..91d16c1cf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_test.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"math/rand"
+	"syscall"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestConnectionInitBlock tests whether initialization
+// correctly blocks and unblocks the connection.
+// Since it's infeasible to test kernelTask.Block() in a unit test,
+// the code in Call() is not tested here.
+func TestConnectionInitBlock(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+
+	conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault)
+	if err != nil {
+		t.Fatalf("newTestConnection: %v", err)
+	}
+
+	select {
+	case <-conn.initializedChan:
+		t.Fatalf("initializedChan should be blocking before SetInitialized")
+	default:
+	}
+
+	conn.SetInitialized()
+
+	select {
+	case <-conn.initializedChan:
+	default:
+		t.Fatalf("initializedChan should not be blocking after SetInitialized")
+	}
+}
+
+func TestConnectionAbort(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	creds := auth.CredentialsFromContext(s.Ctx)
+	task := kernel.TaskFromContext(s.Ctx)
+
+	const numRequests uint64 = 256
+
+	conn, _, err := newTestConnection(s, k, numRequests)
+	if err != nil {
+		t.Fatalf("newTestConnection: %v", err)
+	}
+
+	testObj := &testPayload{
+		data: rand.Uint32(),
+	}
+
+	var futNormal []*futureResponse
+
+	for i := 0; i < int(numRequests); i++ {
+		req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj)
+		if err != nil {
+			t.Fatalf("NewRequest creation failed: %v", err)
+		}
+		fut, err := conn.callFutureLocked(task, req)
+		if err != nil {
+			t.Fatalf("callFutureLocked failed: %v", err)
+		}
+		futNormal = append(futNormal, fut)
+	}
+
+	conn.Abort(s.Ctx)
+
+	// Abort should unblock the initialization channel.
+	// Note: no test requests are actually blocked on `conn.initializedChan`.
+	select {
+	case <-conn.initializedChan:
+	default:
+		t.Fatalf("initializedChan should not be blocking after Abort")
+	}
+
+	// Abort will return ECONNABORTED error to unblocked requests.
+	for _, fut := range futNormal {
+		if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) {
+			t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error)
+		}
+	}
+
+	// After abort, Call() should return directly with ENOTCONN.
+	req, err := conn.NewRequest(creds, 0, 0, 0, testObj)
+	if err != nil {
+		t.Fatalf("NewRequest creation failed: %v", err)
+	}
+	_, err = conn.Call(task, req)
+	if err != syserror.ENOTCONN {
+		t.Fatalf("Incorrect error code received for Call() after connection aborted: %v", err)
+	}
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index e522ff9a0..1b86a4b4c 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -19,7 +19,6 @@ import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -32,6 +31,8 @@ import (
const fuseDevMinor = 229

// fuseDevice implements vfs.Device for /dev/fuse.
+//
+// +stateify savable
type fuseDevice struct{}

// Open implements vfs.Device.Open.
@@ -50,15 +51,14 @@ func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}

// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+// +// +stateify savable type DeviceFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD - // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. - mounted bool - // nextOpID is used to create new requests. nextOpID linux.FUSEOpID @@ -83,7 +83,7 @@ type DeviceFD struct { writeCursorFR *futureResponse // mu protects all the queues, maps, buffers and cursors and nextOpID. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // waitQueue is used to notify interested parties when the device becomes // readable or writable. @@ -92,21 +92,36 @@ type DeviceFD struct { // fullQueueCh is a channel used to synchronize the readers with the writers. // Writers (inbound requests to the filesystem) block if there are too many // unprocessed in-flight requests. - fullQueueCh chan struct{} + fullQueueCh chan struct{} `state:".(int)"` // fs is the FUSE filesystem that this FD is being used for. fs *filesystem } +func (fd *DeviceFD) saveFullQueueCh() int { + return cap(fd.fullQueueCh) +} + +func (fd *DeviceFD) loadFullQueueCh(capacity int) { + fd.fullQueueCh = make(chan struct{}, capacity) +} + // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release(context.Context) { - fd.fs.conn.connected = false +func (fd *DeviceFD) Release(ctx context.Context) { + if fd.fs != nil { + fd.fs.conn.mu.Lock() + fd.fs.conn.connected = false + fd.fs.conn.mu.Unlock() + + fd.fs.VFSFilesystem().DecRef(ctx) + fd.fs = nil + } } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -116,10 +131,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs. + return 0, syserror.ENODEV + } + // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this @@ -143,58 +164,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R } // readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +// +// Preconditions: dst is large enough for any reasonable request. func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - if fd.queue.Empty() { - return 0, syserror.ErrWouldBlock - } + var req *Request - var readCursor uint32 - var bytesRead int64 - for { - req := fd.queue.Front() - if dst.NumBytes() < int64(req.hdr.Len) { - // The request is too large. Cannot process it. All requests must be smaller than the - // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT - // handshake. 
-		errno := -int32(syscall.EIO)
-		if req.hdr.Opcode == linux.FUSE_SETXATTR {
-			errno = -int32(syscall.E2BIG)
-		}
+	// Find the first valid request.
+	// For the normal case this loop only executes once.
+	for !fd.queue.Empty() {
+		req = fd.queue.Front()

-		// Return the error to the calling task.
-		if err := fd.sendError(ctx, errno, req); err != nil {
-			return 0, err
-		}
+		if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
+			break
+		}

-		// We're done with this request.
-		fd.queue.Remove(req)
+		// The request is too large. Cannot process it. All requests must be smaller than the
+		// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+		// handshake.
+		errno := -int32(syscall.EIO)
+		if req.hdr.Opcode == linux.FUSE_SETXATTR {
+			errno = -int32(syscall.E2BIG)
+		}

-		// Restart the read as this request was invalid.
-		log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
-		return fd.readLocked(ctx, dst, opts)
+		// Return the error to the calling task.
+		if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
+			return 0, err
		}

-		n, err := dst.CopyOut(ctx, req.data[readCursor:])
+		// We're done with this request.
+		fd.queue.Remove(req)
+		req = nil
+	}
+
+	if req == nil {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// We already checked the size: dst must be able to fit the whole request.
+	// Now write the marshalled header, the payload,
+	// and any additional payload to the user memory IOSequence.
+
+	n, err := dst.CopyOut(ctx, req.data)
+	if err != nil {
+		return 0, err
+	}
+	if n != len(req.data) {
+		return 0, syserror.EIO
+	}
+
+	if req.hdr.Opcode == linux.FUSE_WRITE {
+		written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
		if err != nil {
			return 0, err
		}
-		readCursor += uint32(n)
-		bytesRead += int64(n)
-
-		if readCursor >= req.hdr.Len {
-			// Fully done with this req, remove it from the queue.
-			fd.queue.Remove(req)
-			break
+		if written != len(req.payload) {
+			return 0, syserror.EIO
		}
+		n += int(written)
	}

-	return bytesRead, nil
+	// Fully done with this req, remove it from the queue.
+	fd.queue.Remove(req)
+
+	// Remove noReply requests from the map of requests expecting a reply.
+	if req.noReply {
+		fd.numActiveRequests -= 1
+		delete(fd.completions, req.hdr.Unique)
+	}
+
+	return int64(n), nil
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
		return 0, syserror.EPERM
	}

@@ -211,10 +256,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.

// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
		return 0, syserror.EPERM
	}

+	// Return ENODEV if the filesystem is umounted.
+	if fd.fs.umounted {
+		return 0, syserror.ENODEV
+	}
+
	var cn, n int64
	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())

@@ -276,7 +326,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
	fut, ok := fd.completions[hdr.Unique]
	if !ok {
-		// Server sent us a response for a request we never sent?
+ // Server sent us a response for a request we never sent, + // or for which we already received a reply (e.g. aborted), an unlikely event. return 0, syserror.EINVAL } @@ -307,8 +358,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt // Readiness implements vfs.FileDescriptionImpl.Readiness. func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readinessLocked(mask) +} + +// readinessLocked implements checking the readiness of the fuse device while +// locked with DeviceFD.mu. +func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - ready |= waiter.EventOut // FD is always writable + + if fd.fs.umounted { + ready |= waiter.EventErr + return ready & mask + } + + // FD is always writable. + ready |= waiter.EventOut if !fd.queue.Empty() { // Have reqs available, FD is readable. ready |= waiter.EventIn @@ -330,7 +396,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -338,59 +404,59 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 } // sendResponse sends a response to the waiting task (if any). +// +// Preconditions: fd.mu must be held. func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { - // See if the running task need to perform some action before returning. - // Since we just finished writing the future, we can be sure that - // getResponse generates a populated response. - if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { - return err - } + // Signal the task waiting on a response if any. + defer close(fut.ch) // Signal that the queue is no longer full. select { case fd.fullQueueCh <- struct{}{}: default: } - fd.numActiveRequests -= 1 + fd.numActiveRequests-- + + if fut.async { + return fd.asyncCallBack(ctx, fut.getResponse()) + } - // Signal the task waiting on a response. - close(fut.ch) return nil } -// sendError sends an error response to the waiting task (if any). -func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { +// sendError sends an error response to the waiting task (if any) by calling sendResponse(). +// +// Preconditions: fd.mu must be held. +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { // Return the error to the calling task. outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) respHdr := linux.FUSEHeaderOut{ Len: outHdrLen, Error: errno, - Unique: req.hdr.Unique, + Unique: unique, } fut, ok := fd.completions[respHdr.Unique] if !ok { - // Server sent us a response for a request we never sent? + // A response for a request we never sent, + // or for which we already received a reply (e.g. aborted). return syserror.EINVAL } delete(fd.completions, respHdr.Unique) fut.hdr = &respHdr - if err := fd.sendResponse(ctx, fut); err != nil { - return err - } - - return nil + return fd.sendResponse(ctx, fut) } -// noReceiverAction has the calling kernel.Task do some action if its known that no -// receiver is going to be waiting on the future channel. This is to be used by: -// FUSE_INIT. 
-func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { - if r.opcode == linux.FUSE_INIT { +// asyncCallBack executes pre-defined callback function for async requests. +// Currently used by: FUSE_INIT. +func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { + switch r.opcode { + case linux.FUSE_INIT: creds := auth.CredentialsFromContext(ctx) rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) + // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. } return nil diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 1ffe7ccd2..5986133e9 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -16,7 +16,6 @@ package fuse import ( "fmt" - "io" "math/rand" "testing" @@ -28,17 +27,12 @@ import ( "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // echoTestOpcode is the Opcode used during testing. The server used in tests // will simply echo the payload back with the appropriate headers. const echoTestOpcode linux.FUSEOpcode = 1000 -type testPayload struct { - data uint32 -} - // TestFUSECommunication tests that the communication layer between the Sentry and the // FUSE server daemon works as expected. func TestFUSECommunication(t *testing.T) { @@ -327,102 +321,3 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F } } } - -func setup(t *testing.T) *testutil.System { - k, err := testutil.Boot() - if err != nil { - t.Fatalf("Error creating kernel: %v", err) - } - - ctx := k.SupervisorContext() - creds := auth.CredentialsFromContext(ctx) - - k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserList: true, - AllowUserMount: true, - }) - - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) - if err != nil { - t.Fatalf("NewMountNamespace(): %v", err) - } - - return testutil.NewSystem(ctx, t, k.VFS(), mntns) -} - -// newTestConnection creates a fuse connection that the sentry can communicate with -// and the FD for the server to communicate with. -func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { - vfsObj := &vfs.VirtualFilesystem{} - fuseDev := &DeviceFD{} - - if err := vfsObj.Init(system.Ctx); err != nil { - return nil, nil, err - } - - vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef(system.Ctx) - if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { - return nil, nil, err - } - - fsopts := filesystemOptions{ - maxActiveRequests: maxActiveRequests, - } - fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) - if err != nil { - return nil, nil, err - } - - return fs.conn, &fuseDev.vfsfd, nil -} - -// SizeBytes implements marshal.Marshallable.SizeBytes. -func (t *testPayload) SizeBytes() int { - return 4 -} - -// MarshalBytes implements marshal.Marshallable.MarshalBytes. -func (t *testPayload) MarshalBytes(dst []byte) { - usermem.ByteOrder.PutUint32(dst[:4], t.data) -} - -// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
-func (t *testPayload) UnmarshalBytes(src []byte) { - *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])} -} - -// Packed implements marshal.Marshallable.Packed. -func (t *testPayload) Packed() bool { - return true -} - -// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. -func (t *testPayload) MarshalUnsafe(dst []byte) { - t.MarshalBytes(dst) -} - -// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. -func (t *testPayload) UnmarshalUnsafe(src []byte) { - t.UnmarshalBytes(src) -} - -// CopyOutN implements marshal.Marshallable.CopyOutN. -func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - panic("not implemented") -} - -// CopyOut implements marshal.Marshallable.CopyOut. -func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { - panic("not implemented") -} - -// CopyIn implements marshal.Marshallable.CopyIn. -func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - panic("not implemented") -} - -// WriteTo implements io.WriterTo.WriteTo. -func (t *testPayload) WriteTo(w io.Writer) (int64, error) { - panic("not implemented") -} diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go new file mode 100644 index 000000000..8f220a04b --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -0,0 +1,105 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type directoryFD struct { + fileDescription +} + +// Allocate implements directoryFD.Allocate. +func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
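+// Each call issues a single FUSE_READDIR request of up to FUSE_PAGE_SIZE
+// bytes at the current offset and feeds the decoded entries to callback,
+// persisting the offset after each entry.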
+func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
+	fusefs := dir.inode().fs
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSEReadIn{
+		Fh:     dir.Fh,
+		Offset: uint64(atomic.LoadInt64(&dir.off)),
+		Size:   linux.FUSE_PAGE_SIZE,
+		Flags:  dir.statusFlags(),
+	}
+
+	// TODO(gvisor.dev/issue/3404): Support FUSE_READDIRPLUS.
+	req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := fusefs.conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	if err := res.Error(); err != nil {
+		return err
+	}
+
+	var out linux.FUSEDirents
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	for _, fuseDirent := range out.Dirents {
+		nextOff := int64(fuseDirent.Meta.Off)
+		dirent := vfs.Dirent{
+			Name:    fuseDirent.Name,
+			Type:    uint8(fuseDirent.Meta.Type),
+			Ino:     fuseDirent.Meta.Ino,
+			NextOff: nextOff,
+		}
+
+		if err := callback.Handle(dirent); err != nil {
+			return err
+		}
+		atomic.StoreInt64(&dir.off, nextOff)
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
new file mode 100644
index 000000000..83f2816b7
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/file.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription implements vfs.FileDescriptionImpl for fuse.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// Fh is the file handle used in userspace.
+	Fh uint64
+
+	// Nonseekable indicates that the file cannot be seeked.
+	Nonseekable bool
+
+	// DirectIO suggests that FUSE use direct I/O operations.
+	DirectIO bool
+
+	// OpenFlag holds the flags returned by open.
+	OpenFlag uint32
+
+	// off is the file offset.
+	off int64
+}
+
+func (fd *fileDescription) dentry() *kernfs.Dentry {
+	return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.dentry().Inode().(*inode)
+}
+
+func (fd *fileDescription) filesystem() *vfs.Filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *fileDescription) statusFlags() uint32 {
+	return fd.vfsfd.StatusFlags()
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+	// No need to send a release request if the FUSE server doesn't implement Open.
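+	// (noOpen is set when the FUSE server replies ENOSYS to FUSE_OPEN;
+	// see inode.Open in fusefs.go.)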
+ conn := fd.inode().fs.conn + if conn.noOpen { + return + } + + in := linux.FUSEReleaseIn{ + Fh: fd.Fh, + Flags: fd.statusFlags(), + } + // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner. + var opcode linux.FUSEOpcode + if fd.inode().Mode().IsDir() { + opcode = linux.FUSE_RELEASEDIR + } else { + opcode = linux.FUSE_RELEASE + } + kernelTask := kernel.TaskFromContext(ctx) + // ignoring errors and FUSE server reply is analogous to Linux's behavior. + req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) + if err != nil { + // No way to invoke Call() with an errored request. + return + } + // The reply will be ignored since no callback is defined in asyncCallBack(). + conn.CallAsync(kernelTask, req) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh) +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 83c24ec25..e39df21c6 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -16,24 +16,36 @@ package fuse import ( + "math" "strconv" + "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" ) // Name is the default filesystem name. const Name = "fuse" +// maxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const maxActiveRequestsDefault = 10000 + // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} +// +stateify savable type filesystemOptions struct { // userID specifies the numeric uid of the mount owner. // This option should not be specified by the filesystem owner. @@ -56,9 +68,16 @@ type filesystemOptions struct { // exist at any time. 
Any further requests will block when trying to // Call the server. maxActiveRequests uint64 + + // maxRead is the max number of bytes to read, + // specified as "max_read" in fs parameters. + // If not specified by user, use math.MaxUint32 as default value. + maxRead uint32 } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 @@ -69,6 +88,9 @@ type filesystem struct { // opts is the options the fusefs is initialized with. opts *filesystemOptions + + // umounted is true if filesystem.Release() has been called. + umounted bool } // Name implements vfs.FilesystemType.Name. @@ -76,6 +98,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -142,14 +167,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Set the maxInFlightRequests option. fsopts.maxActiveRequests = maxActiveRequestsDefault + if maxReadStr, ok := mopts["max_read"]; ok { + delete(mopts, "max_read") + maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) + return nil, nil, syserror.EINVAL + } + if maxRead < fuseMinMaxRead { + maxRead = fuseMinMaxRead + } + fsopts.maxRead = uint32(maxRead) + } else { + fsopts.maxRead = math.MaxUint32 + } + // Check for unparsed options. if len(mopts) != 0 { - log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) + log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) return nil, nil, syserror.EINVAL } // Create a new FUSE filesystem. - fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) return nil, nil, err @@ -165,26 +205,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // root is the fusefs root directory. - root := fs.newInode(creds, fsopts.rootMode) + root := fs.newRootInode(creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } -// NewFUSEFilesystem creates a new FUSE filesystem. -func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { - fs := &filesystem{ - devMinor: devMinor, - opts: opts, - } - - conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) +// newFUSEFilesystem creates a new FUSE filesystem. 
+func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+	conn, err := newFUSEConnection(ctx, device, opts)
	if err != nil {
		log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
		return nil, syserror.EINVAL
	}

-	fs.conn = conn
	fuseFD := device.Impl().(*DeviceFD)
+
+	fs := &filesystem{
+		devMinor: devMinor,
+		opts:     opts,
+		conn:     conn,
+	}
+
+	fs.VFSFilesystem().IncRef()
	fuseFD.fs = fs

	return fs, nil
@@ -192,39 +234,361 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt

// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
+	fs.conn.fd.mu.Lock()
+
+	fs.umounted = true
+	fs.conn.Abort(ctx)
+	// Notify all the waiters on this fd.
+	fs.conn.fd.waitQueue.Notify(waiter.EventIn)
+
+	fs.conn.fd.mu.Unlock()
+
	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release(ctx)
}

// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
+	inodeRefs
+	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
-	kernfs.InodeNoDynamicLookup
-	kernfs.InodeNotSymlink
	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
	kernfs.OrderedChildren

+	// fs is the owning filesystem; it is immutable.
+	fs *filesystem
+
+	// metadataMu protects the metadata of this inode.
+	metadataMu sync.Mutex
+
+	nodeID uint64
+
	locks vfs.FileLocks

-	dentry kernfs.Dentry
+	// size of the file.
+	size uint64
+
+	// attributeVersion is the version of the inode's attributes.
+	attributeVersion uint64
+
+	// attributeTime is the remaining valid time of the attributes.
+	attributeTime uint64
+
+	// version of the inode.
+	version uint64
+
+	// link is the result of following a symbolic link.
+	link string
}

-func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
-	i := &inode{}
-	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+	i := &inode{fs: fs, nodeID: 1}
+	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	i.dentry.Init(i)
+	i.EnableLeakCheck()

-	return &i.dentry
+	var d kernfs.Dentry
+	d.Init(&fs.Filesystem, i)
+	return &d
+}
+
+func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) kernfs.Inode {
+	i := &inode{fs: fs, nodeID: nodeID}
+	creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.GID), EffectiveKUID: auth.KUID(attr.UID)}
+	i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
+	atomic.StoreUint64(&i.size, attr.Size)
+	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	i.EnableLeakCheck()
+	return i
}

// Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	isDir := i.InodeAttrs.Mode().IsDir()
+	// Return an error if a directory open is requested but the inode is not a directory.
+	if !isDir && opts.Mode.IsDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
+		return nil, syserror.EOVERFLOW
+	}
+
+	var fd *fileDescription
+	var fdImpl vfs.FileDescriptionImpl
+	if isDir {
+		directoryFD := &directoryFD{}
+		fd = &(directoryFD.fileDescription)
+		fdImpl = directoryFD
+	} else {
+		regularFD := &regularFileFD{}
+		fd = &(regularFD.fileDescription)
+		fdImpl = regularFD
+	}
+	// FOPEN_KEEP_CACHE is the default flag for noOpen.
+	fd.OpenFlag = linux.FOPEN_KEEP_CACHE
+
+	// Only send the open request when the FUSE server supports open or when
+	// opening a directory.
+	if !i.fs.conn.noOpen || isDir {
+		kernelTask := kernel.TaskFromContext(ctx)
+		if kernelTask == nil {
+			log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
+			return nil, syserror.EINVAL
+		}
+
+		// Build the request.
+		var opcode linux.FUSEOpcode
+		if isDir {
+			opcode = linux.FUSE_OPENDIR
+		} else {
+			opcode = linux.FUSE_OPEN
+		}
+
+		in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
+		if !i.fs.conn.atomicOTrunc {
+			in.Flags &= ^uint32(linux.O_TRUNC)
+		}
+
+		req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
+		if err != nil {
+			return nil, err
+		}
+
+		// Send the request and receive the reply.
+		res, err := i.fs.conn.Call(kernelTask, req)
+		if err != nil {
+			return nil, err
+		}
+		if err := res.Error(); err == syserror.ENOSYS && !isDir {
+			i.fs.conn.noOpen = true
+		} else if err != nil {
+			return nil, err
+		} else {
+			out := linux.FUSEOpenOut{}
+			if err := res.UnmarshalPayload(&out); err != nil {
+				return nil, err
+			}
+
+			// Process the reply.
+			fd.OpenFlag = out.OpenFlag
+			if isDir {
+				fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
+			}
+
+			fd.Fh = out.Fh
+		}
+	}
+
+	// TODO(gvisor.dev/issue/3234): invalidate mmap after implementing it for FUSE inodes.
+	fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
+	fdOptions := &vfs.FileDescriptionOptions{}
+	if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
+		fdOptions.DenyPRead = true
+		fdOptions.DenyPWrite = true
+		fd.Nonseekable = true
+	}
+
+	// If we don't send SETATTR before open (which is indicated by atomicOTrunc)
+	// and O_TRUNC is set, update the inode's version number and clean existing data
+	// by setting the file size to 0.
+	if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
+		i.fs.conn.mu.Lock()
+		i.fs.conn.attributeVersion++
+		i.attributeVersion = i.fs.conn.attributeVersion
+		atomic.StoreUint64(&i.size, 0)
+		i.fs.conn.mu.Unlock()
+		i.attributeTime = 0
+	}
+
+	if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	in := linux.FUSELookupIn{Name: name}
+	return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
+}
+
+// Keep implements kernfs.Inode.Keep.
+func (i *inode) Keep() bool {
+	// Return true so that kernfs keeps the new dentry pointing to this
+	// inode in the dentry tree. This is needed because inodes created via
+	// Lookup are not temporary. They might refer to existing files on the
+	// server that can be Unlink'd/Rmdir'd.
+	return true
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
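+// The inode-level implementation is a pass-through that leaves offset
+// untouched; the actual FUSE_READDIR traffic is driven by
+// directoryFD.IterDirents in directory.go.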
+func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
+// NewFile implements kernfs.Inode.NewFile.
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context")
+		return nil, syserror.EINVAL
+	}
+	in := linux.FUSECreateIn{
+		CreateMeta: linux.FUSECreateMeta{
+			Flags: opts.Flags,
+			Mode:  uint32(opts.Mode) | linux.S_IFREG,
+			Umask: uint32(kernelTask.FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
+}
+
+// NewNode implements kernfs.Inode.NewNode.
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) {
+	in := linux.FUSEMknodIn{
+		MknodMeta: linux.FUSEMknodMeta{
+			Mode:  uint32(opts.Mode),
+			Rdev:  linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
+			Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
+}
+
+// NewSymlink implements kernfs.Inode.NewSymlink.
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) {
+	in := linux.FUSESymLinkIn{
+		Name:   name,
+		Target: target,
+	}
+	return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
+}
+
+// Unlink implements kernfs.Inode.Unlink.
+func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.Unlink: couldn't get kernel task from context")
+		return syserror.EINVAL
+	}
+	in := linux.FUSEUnlinkIn{Name: name}
+	req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
	if err != nil {
+		return err
+	}
+	res, err := i.fs.conn.Call(kernelTask, req)
+	if err != nil {
+		return err
+	}
+	// Only return the error; discard res.
+	return res.Error()
+}
+
+// NewDir implements kernfs.Inode.NewDir.
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+	in := linux.FUSEMkdirIn{
+		MkdirMeta: linux.FUSEMkdirMeta{
+			Mode:  uint32(opts.Mode),
+			Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+	fusefs := i.fs
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSERmDirIn{Name: name}
+	req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := i.fs.conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	return res.Error()
+}
+
+// newEntry calls the FUSE server for entry creation and allocates the
+// corresponding entry according to the response.
+// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
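+// For the creation opcodes, a reply whose file type does not match the
+// requested one, or whose node ID is 0 or FUSE_ROOT_ID, is rejected with EIO.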
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context")
+		return nil, syserror.EINVAL
+	}
+	req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
+	if err != nil {
+		return nil, err
+	}
+	res, err := i.fs.conn.Call(kernelTask, req)
+	if err != nil {
+		return nil, err
+	}
+	if err := res.Error(); err != nil {
+		return nil, err
+	}
+	out := linux.FUSEEntryOut{}
+	if err := res.UnmarshalPayload(&out); err != nil {
		return nil, err
	}
-	return fd.VFSFileDescription(), nil
+	if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
+		return nil, syserror.EIO
+	}
+	child := i.fs.newInode(out.NodeID, out.Attr)
+	return child, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	path, err := i.Readlink(ctx, mnt)
+	return vfs.VirtualDentry{}, path, err
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	if i.Mode().FileType() != linux.S_IFLNK {
+		return "", syserror.EINVAL
+	}
+	if len(i.link) == 0 {
+		kernelTask := kernel.TaskFromContext(ctx)
+		if kernelTask == nil {
+			log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
+			return "", syserror.EINVAL
+		}
+		req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
+		if err != nil {
+			return "", err
+		}
+		res, err := i.fs.conn.Call(kernelTask, req)
+		if err != nil {
+			return "", err
+		}
+		i.link = string(res.data[res.hdr.SizeBytes():])
+		if !mnt.Options().ReadOnly {
+			i.attributeTime = 0
+		}
+	}
+	return i.link, nil
+}
+
+// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
+// TODO(gvisor.dev/issue/3679): Add support for other fields.
+func (i *inode) getFUSEAttr() linux.FUSEAttr {
+	return linux.FUSEAttr{
+		Ino:  i.Ino(),
+		Size: atomic.LoadUint64(&i.size),
+		Mode: uint32(i.Mode()),
+	}
}

// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
@@ -280,45 +644,179 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
	return stat
}

-// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	fusefs := fs.Impl().(*filesystem)
-	conn := fusefs.conn
-	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+// getAttr gets the attributes of this inode by issuing a FUSE_GETATTR request
+// or reading them from the local cache. It updates the corresponding
+// attributes if necessary.
+func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
+	attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
+
+	// TODO(gvisor.dev/issue/3679): send the request only if
+	//	- invalid local cache for fields specified in the opts.Mask
+	//	- forced update
+	//	- i.attributeTime expired
+	// If local cache is still valid, return local cache.
+ // Currently we always send a request, + // and we always set the metadata with the new result, + // unless attributeVersion has changed. + + task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return linux.Statx{}, syserror.EINVAL + return linux.FUSEAttr{}, syserror.EINVAL } - var in linux.FUSEGetAttrIn - // We don't set any attribute in the request, because in VFS2 fstat(2) will - // finally be translated into vfs.FilesystemImpl.StatAt() (see - // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow - // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2. - req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in) + creds := auth.CredentialsFromContext(ctx) + + in := linux.FUSEGetAttrIn{ + GetAttrFlags: flags, + Fh: fh, + } + req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) if err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } - res, err := conn.Call(task, req) + res, err := i.fs.conn.Call(task, req) if err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } if err := res.Error(); err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } var out linux.FUSEGetAttrOut if err := res.UnmarshalPayload(&out); err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err + } + + // Local version is newer, return the local one. + // Skip the update. + if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion { + return i.getFUSEAttr(), nil } - // Set all metadata into kernfs.InodeAttrs. - if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{ - Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor), + // Set the metadata of kernfs.InodeAttrs. + if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), }); err != nil { + return linux.FUSEAttr{}, err + } + + // Set the size if no error (after SetStat() check). + atomic.StoreUint64(&i.size, out.Attr.Size) + + return out.Attr, nil +} + +// reviseAttr attempts to update the attributes for internal purposes +// by calling getAttr with a pre-specified mask. +// Used by read, write, lseek. +func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { + // Never need atime for internal purposes. + _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, + }, flags, fh) + return err +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + attr, err := i.getAttr(ctx, fs, opts, 0, 0) + if err != nil { return linux.Statx{}, err } - return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil + return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil +} + +// DecRef implements kernfs.Inode.DecRef. +func (i *inode) DecRef(ctx context.Context) { + i.inodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. + return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil +} + +// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask +// aligned with the attribute mask defined in include/linux/fs.h. 
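+// For example, a Stat.Mask of STATX_MODE|STATX_SIZE maps to
+// FATTR_MODE|FATTR_SIZE.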
+func fattrMaskFromStats(mask uint32) uint32 { + var fuseAttrMask uint32 + maskMap := map[uint32]uint32{ + linux.STATX_MODE: linux.FATTR_MODE, + linux.STATX_UID: linux.FATTR_UID, + linux.STATX_GID: linux.FATTR_GID, + linux.STATX_SIZE: linux.FATTR_SIZE, + linux.STATX_ATIME: linux.FATTR_ATIME, + linux.STATX_MTIME: linux.FATTR_MTIME, + linux.STATX_CTIME: linux.FATTR_CTIME, + } + for statxMask, fattrMask := range maskMap { + if mask&statxMask != 0 { + fuseAttrMask |= fattrMask + } + } + return fuseAttrMask +} + +// SetStat implements kernfs.Inode.SetStat. +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return i.setAttr(ctx, fs, creds, opts, false, 0) +} + +func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error { + conn := i.fs.conn + task := kernel.TaskFromContext(ctx) + if task == nil { + log.Warningf("couldn't get kernel task from context") + return syserror.EINVAL + } + + // We should retain the original file type when assigning new mode. + fileType := uint16(i.Mode()) & linux.S_IFMT + fattrMask := fattrMaskFromStats(opts.Stat.Mask) + if useFh { + fattrMask |= linux.FATTR_FH + } + in := linux.FUSESetAttrIn{ + Valid: fattrMask, + Fh: fh, + Size: opts.Stat.Size, + Atime: uint64(opts.Stat.Atime.Sec), + Mtime: uint64(opts.Stat.Mtime.Sec), + Ctime: uint64(opts.Stat.Ctime.Sec), + AtimeNsec: opts.Stat.Atime.Nsec, + MtimeNsec: opts.Stat.Mtime.Nsec, + CtimeNsec: opts.Stat.Ctime.Nsec, + Mode: uint32(fileType | opts.Stat.Mode), + UID: opts.Stat.UID, + GID: opts.Stat.GID, + } + req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) + if err != nil { + return err + } + + res, err := conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + out := linux.FUSEGetAttrOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + // Set the metadata of kernfs.InodeAttrs. + if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), + }); err != nil { + return err + } + + return nil } diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go new file mode 100644 index 000000000..625d1547f --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -0,0 +1,242 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
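The read path in this file splits one logical read into page-aligned FUSE_READ requests, each bounded by both the mount's max_read and the connection's negotiated max_pages. A standalone sketch of just that chunking arithmetic (4KiB pages and the readChunks name are assumptions for illustration):

const pageSize = 4096

// readChunks lists the (offset, length) pairs that a read of size bytes at
// off would be fragmented into, given a per-request budget of maxPages
// pages. It mirrors the loop bounds in ReadInPages below.
func readChunks(off uint64, size, maxPages uint32) [][2]uint64 {
	rounded := (uint64(size) + pageSize - 1) &^ uint64(pageSize-1) // round up to a page multiple
	pagesToRead := uint32(rounded / pageSize)
	var chunks [][2]uint64
	for pagesRead := uint32(0); pagesRead < pagesToRead; {
		pagesCanRead := pagesToRead - pagesRead
		if pagesCanRead > maxPages {
			pagesCanRead = maxPages
		}
		chunks = append(chunks, [2]uint64{
			off + uint64(pagesRead)*pageSize,
			uint64(pagesCanRead) * pageSize,
		})
		pagesRead += pagesCanRead
	}
	return chunks
}

For example, readChunks(0, 10*4096, 4) yields chunks of 4, 4 and 2 pages, matching how the loop in ReadInPages advances pagesRead.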
+
+package fuse
+
+import (
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ReadInPages sends FUSE_READ requests for the size after rounding it up to
+// a multiple of the page size, blocks on the replies, processes them,
+// and returns the payload (or joined payloads) as byte slices.
+// This is used for general purpose reading.
+// We do not support direct IO (which reads the exact number of bytes)
+// at this moment.
+func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
+	attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
+
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Read: couldn't get kernel task from context")
+		return nil, 0, syserror.EINVAL
+	}
+
+	// Round up to a multiple of page size.
+	readSize, _ := usermem.PageRoundUp(uint64(size))
+
+	// One request cannot exceed either maxRead or maxPages.
+	maxPages := fs.conn.maxRead >> usermem.PageShift
+	if maxPages > uint32(fs.conn.maxPages) {
+		maxPages = uint32(fs.conn.maxPages)
+	}
+
+	var outs [][]byte
+	var sizeRead uint32
+
+	// readSize is a multiple of usermem.PageSize.
+	// Always request bytes as a multiple of pages.
+	pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
+
+	// Reuse the same struct for marshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEReadIn{
+		Fh:        fd.Fh,
+		LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
+		ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		Flags:     fd.statusFlags(),
+	}
+
+	// This loop is intended for fragmented reads where the number of bytes to
+	// read is larger than either maxPages or maxRead.
+	// For the majority of reads with normal size, this loop should only
+	// execute once.
+	for pagesRead < pagesToRead {
+		pagesCanRead := pagesToRead - pagesRead
+		if pagesCanRead > maxPages {
+			pagesCanRead = maxPages
+		}
+
+		in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
+		in.Size = pagesCanRead << usermem.PageShift
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
+		if err != nil {
+			return nil, 0, err
+		}
+
+		// TODO(gvisor.dev/issue/3247): support async read.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return nil, 0, err
+		}
+		if err := res.Error(); err != nil {
+			return nil, 0, err
+		}
+
+		// Not enough bytes in the response:
+		// either we reached EOF,
+		// or the FUSE server sent back a response
+		// that cannot even fit the header.
+		if len(res.data) <= res.hdr.SizeBytes() {
+			// We treat both cases as EOF here for now
+			// since there is no reliable way to detect
+			// the over-short header case.
+			break
+		}
+
+		// Directly using the slice to avoid extra copy.
+		out := res.data[res.hdr.SizeBytes():]
+
+		outs = append(outs, out)
+		sizeRead += uint32(len(out))
+
+		pagesRead += pagesCanRead
+	}
+
+	defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
+
+	// No bytes returned: offset >= EOF.
+	if len(outs) == 0 {
+		return nil, 0, io.EOF
+	}
+
+	return outs, sizeRead, nil
+}
+
+// ReadCallback updates the inode's metadata after receiving a read response.
+// Due to readahead, sizeRead can be larger than size.
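+// A short read (sizeRead < size) is taken as EOF: the cached file size is
+// truncated to the new end of file, unless the inode's attributes changed
+// concurrently (detected via attributeVersion).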
+func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
+	// TODO(gvisor.dev/issue/3247): support async read.
+	// If this is called by an async read, correctly process it.
+	// May need to update the signature.
+
+	i := fd.inode()
+	// TODO(gvisor.dev/issue/1193): Invalidate or update atime.
+
+	// Reached EOF.
+	if sizeRead < size {
+		// TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
+		// Might need to update the buf to be returned from the Read().
+
+		// Update existing size.
+		newSize := off + uint64(sizeRead)
+		fs.conn.mu.Lock()
+		if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
+			fs.conn.attributeVersion++
+			i.attributeVersion = i.fs.conn.attributeVersion
+			atomic.StoreUint64(&i.size, newSize)
+		}
+		fs.conn.mu.Unlock()
+	}
+}
+
+// Write sends FUSE_WRITE requests and returns the number of bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Write: couldn't get kernel task from context")
+		return 0, syserror.EINVAL
+	}
+
+	// One request cannot exceed either maxWrite or maxPages.
+	maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+	if maxWrite > fs.conn.maxWrite {
+		maxWrite = fs.conn.maxWrite
+	}
+
+	// Reuse the same struct for marshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEWriteIn{
+		Fh: fd.Fh,
+		// TODO(gvisor.dev/issue/3245): file lock
+		LockOwner: 0,
+		// TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		// TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+		WriteFlags: 0,
+		Flags:      fd.statusFlags(),
+	}
+
+	var written uint32
+
+	// This loop is intended for fragmented writes where the number of bytes to
+	// write is larger than either maxWrite or maxPages, or when bigWrites is false.
+	// Unless a small value for max_write is explicitly used, this loop
+	// is expected to execute only once for the majority of writes.
+	for written < size {
+		toWrite := size - written
+
+		// Limit the write size to one page.
+		// Note that the bigWrites flag is obsolete;
+		// the latest libfuse always sets it on.
+		if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+			toWrite = usermem.PageSize
+		}
+
+		// Limit the write size to maxWrite.
+		if toWrite > maxWrite {
+			toWrite = maxWrite
+		}
+
+		in.Offset = off + uint64(written)
+		in.Size = toWrite
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in)
+		if err != nil {
+			return 0, err
+		}
+
+		req.payload = data[written : written+toWrite]
+
+		// TODO(gvisor.dev/issue/3247): support async write.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return 0, err
+		}
+		if err := res.Error(); err != nil {
+			return 0, err
+		}
+
+		out := linux.FUSEWriteOut{}
+		if err := res.UnmarshalPayload(&out); err != nil {
+			return 0, err
+		}
+
+		// Write more than requested? EIO.
+		if out.Size > toWrite {
+			return 0, syserror.EIO
+		}
+
+		written += out.Size
+
+		// Break if short write. Not necessarily an error.
+
+// Write sends FUSE_WRITE requests and returns the number of bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Write: couldn't get kernel task from context")
+		return 0, syserror.EINVAL
+	}
+
+	// One request cannot exceed either maxWrite or maxPages.
+	maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+	if maxWrite > fs.conn.maxWrite {
+		maxWrite = fs.conn.maxWrite
+	}
+
+	// Reuse the same struct for marshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEWriteIn{
+		Fh: fd.Fh,
+		// TODO(gvisor.dev/issue/3245): file lock
+		LockOwner: 0,
+		// TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		// TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+		WriteFlags: 0,
+		Flags:      fd.statusFlags(),
+	}
+
+	var written uint32
+
+	// This loop is intended for fragmented writes where the bytes to write
+	// are larger than either maxWrite or maxPages, or when bigWrites is false.
+	// Unless a small value for max_write is explicitly used, this loop
+	// is expected to execute only once for the majority of writes.
+	for written < size {
+		toWrite := size - written
+
+		// Limit the write size to one page.
+		// Note that the bigWrites flag is obsolete;
+		// the latest libfuse always sets it on.
+		if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+			toWrite = usermem.PageSize
+		}
+
+		// Limit the write size to maxWrite.
+		if toWrite > maxWrite {
+			toWrite = maxWrite
+		}
+
+		in.Offset = off + uint64(written)
+		in.Size = toWrite
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in)
+		if err != nil {
+			return 0, err
+		}
+
+		req.payload = data[written : written+toWrite]
+
+		// TODO(gvisor.dev/issue/3247): support async write.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return 0, err
+		}
+		if err := res.Error(); err != nil {
+			return 0, err
+		}
+
+		out := linux.FUSEWriteOut{}
+		if err := res.UnmarshalPayload(&out); err != nil {
+			return 0, err
+		}
+
+		// Write more than requested? EIO.
+		if out.Size > toWrite {
+			return 0, syserror.EIO
+		}
+
+		written += out.Size
+
+		// Break if short write. Not necessarily an error.
+		if out.Size != toWrite {
+			break
+		}
+	}
+
+	return written, nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
new file mode 100644
index 000000000..5bdd096c3
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset.
+	off int64
+	// offMu protects off.
+	offMu sync.Mutex
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	size := dst.NumBytes()
+	if size == 0 {
+		// Early return if count is 0.
+		return 0, nil
+	} else if size > math.MaxUint32 {
+		// FUSE only supports uint32 for size.
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
+
+	// TODO(gvisor.dev/issue/3678): Add direct IO support.
+
+	inode := fd.inode()
+
+	// If reading beyond EOF, update the file size in case it is outdated.
+	if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
+		if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
+			return 0, err
+		}
+		// If the offset after the update is still too large, return an error.
+		if uint64(offset) >= atomic.LoadUint64(&inode.size) {
+			return 0, io.EOF
+		}
+	}
+
+	// Truncate the read with the updated file size.
+	fileSize := atomic.LoadUint64(&inode.size)
+	if uint64(offset+size) > fileSize {
+		size = int64(fileSize) - offset
+	}
+
+	buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
+	// store the bytes that were read ahead.
+
+	// Update the number of bytes to copy for a short read.
+	if n < uint32(size) {
+		size = int64(n)
+	}
+
+	// Copy the bytes read to the dst.
+	// This loop is intended for fragmented reads.
+	// For the majority of reads, this loop only executes once.
+	var copied int64
+	for _, buffer := range buffers {
+		toCopy := int64(len(buffer))
+		if copied+toCopy > size {
+			toCopy = size - copied
+		}
+		cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
+		if err != nil {
+			return 0, err
+		}
+		if int64(cp) != toCopy {
+			return 0, syserror.EIO
+		}
+		copied += toCopy
+	}
+
+	return copied, nil
+}
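
The copy loop above drains the reply fragments into the destination while honoring the clamped byte budget. The same control flow over plain byte slices, as an illustrative sketch (copyFragments is a made-up name, not a gVisor function):

package main

import "fmt"

// copyFragments copies from frags into dst in order, stopping once dst is
// full, and returns the number of bytes copied. dst plays the role of the
// capped usermem.IOSequence in PRead above.
func copyFragments(dst []byte, frags [][]byte) int {
	copied := 0
	for _, f := range frags {
		if copied == len(dst) {
			break // budget exhausted; remaining fragments are readahead
		}
		copied += copy(dst[copied:], f)
	}
	return copied
}

func main() {
	dst := make([]byte, 5)
	n := copyFragments(dst, [][]byte{[]byte("ab"), []byte("cd"), []byte("ef")})
	fmt.Println(n, string(dst)) // 5 abcde
}
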
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// pwrite returns the number of bytes written, the final offset, and an error.
+// The final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+	if offset < 0 {
+		return 0, offset, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	inode := fd.inode()
+	inode.metadataMu.Lock()
+	defer inode.metadataMu.Unlock()
+
+	// If the file is opened with O_APPEND, update the offset to the file size.
+	// Note: since our Open() implements the interface of kernfs,
+	// and kernfs currently does not support O_APPEND, this will never
+	// be true before we switch away from kernfs.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking inode.metadataMu is sufficient for reading size.
+		offset = int64(inode.size)
+	}
+
+	srclen := src.NumBytes()
+
+	if srclen > math.MaxUint32 {
+		// FUSE only supports uint32 for size.
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+	if end := offset + srclen; end < offset {
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+
+	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+	if err != nil {
+		return 0, offset, err
+	}
+
+	if srclen == 0 {
+		// Return before causing any side effects.
+		return 0, offset, nil
+	}
+
+	src = src.TakeFirst64(srclen)
+
+	// TODO(gvisor.dev/issue/3237): Add cache support:
+	// buffer cache. Ideally we write from src to our buffer cache first.
+	// The slice passed to fs.Write() should be a slice from the buffer cache.
+	data := make([]byte, srclen)
+	// Reason for making a copy here: connection.Call() blocks on the kernel
+	// task, which in turn acquires the mm.activeMu lock. Functions like
+	// CopyInTo() will attempt to acquire the mm.activeMu lock as well -> deadlock.
+	// We must finish reading from the userspace memory before
+	// t.Block() deactivates it.
+	cp, err := src.CopyIn(ctx, data)
+	if err != nil {
+		return 0, offset, err
+	}
+	if int64(cp) != srclen {
+		return 0, offset, syserror.EIO
+	}
+
+	n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
+	if err != nil {
+		return 0, offset, err
+	}
+
+	if n == 0 {
+		// We have checked srclen != 0 previously.
+		// If err == nil, then it's a short write and we return EIO.
+		return 0, offset, syserror.EIO
+	}
+
+	written = int64(n)
+	finalOff = offset + written
+
+	if finalOff > int64(inode.size) {
+		atomic.StoreUint64(&inode.size, uint64(finalOff))
+		atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
+	}
+
+	return
+}
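
The `end < offset` comparison above is the usual wrap-around check when adding two non-negative signed integers (Go defines signed overflow as two's-complement wrap, so the test is reliable). A small sketch of the guard with made-up values:

package main

import (
	"errors"
	"fmt"
	"math"
)

// checkWriteRange rejects offset/length pairs whose sum wraps around,
// using the same guard as pwrite above. Names are illustrative.
func checkWriteRange(offset, srclen int64) error {
	if offset < 0 || srclen < 0 {
		return errors.New("negative offset or length")
	}
	if end := offset + srclen; end < offset {
		return errors.New("offset + length overflows int64")
	}
	return nil
}

func main() {
	fmt.Println(checkWriteRange(1024, 4096))             // <nil>
	fmt.Println(checkWriteRange(math.MaxInt64-10, 4096)) // overflow error
}
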
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
new file mode 100644
index 000000000..7fa00569b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/request_response.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
+// server may implement an older version of the FUSE protocol, whose
+// linux.FUSEInitOut contains fewer attributes.
+//
+// Dynamically-sized objects cannot be marshalled.
+type fuseInitRes struct {
+	marshal.StubMarshallable
+
+	// initOut contains the response from the FUSE server.
+	initOut linux.FUSEInitOut
+
+	// initLen is the total length of bytes of the response.
+	initLen uint32
+}
+
+// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes.
+func (r *fuseInitRes) UnmarshalBytes(src []byte) {
+	out := &r.initOut
+
+	// Introduced before FUSE kernel version 7.13.
+	out.Major = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2]))
+	src = src[2:]
+	out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2]))
+	src = src[2:]
+	out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+
+	// Introduced in FUSE kernel version 7.23.
+	if len(src) >= 4 {
+		out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4]))
+		src = src[4:]
+	}
+	// Introduced in FUSE kernel version 7.28.
+	if len(src) >= 2 {
+		out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2]))
+		src = src[2:]
+	}
+}
+
+// SizeBytes is the size of the payload of the FUSE_INIT response.
+func (r *fuseInitRes) SizeBytes() int {
+	return int(r.initLen)
+}
+
+// Ordinary requests have even IDs, while interrupt IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+	requestEntry
+
+	id   linux.FUSEOpID
+	hdr  *linux.FUSEHeaderIn
+	data []byte
+
+	// payload for this request: extra bytes to write after
+	// the data slice. Used by FUSE_WRITE.
+	payload []byte
+
+	// async is true if this request is asynchronous.
+	async bool
+	// noReply is true if we do not care about this request's response.
+	// Manually set by the caller.
+	noReply bool
+}
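
fuseInitRes copes with older servers by decoding a fixed prefix and then consuming optional trailing fields only while bytes remain. The same idea in miniature, on a toy two-field struct (initOut here is a stand-in, not the real linux.FUSEInitOut, and little-endian encoding is assumed):

package main

import (
	"encoding/binary"
	"fmt"
)

// initOut illustrates length-guarded unmarshalling: one mandatory field
// followed by one field that only newer protocol revisions emit.
type initOut struct {
	Major    uint32 // always present
	MaxPages uint16 // only present in newer protocol revisions
}

func unmarshal(src []byte) initOut {
	var out initOut
	out.Major = binary.LittleEndian.Uint32(src[:4])
	src = src[4:]
	if len(src) >= 2 { // older servers simply omit the field
		out.MaxPages = binary.LittleEndian.Uint16(src[:2])
	}
	return out
}

func main() {
	fmt.Println(unmarshal([]byte{7, 0, 0, 0}))        // {7 0}
	fmt.Println(unmarshal([]byte{7, 0, 0, 0, 32, 0})) // {7 32}
}
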
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+	hdr := linux.FUSEHeaderIn{
+		Len:    uint32(hdrLen + payload.SizeBytes()),
+		Opcode: opcode,
+		Unique: conn.fd.nextOpID,
+		NodeID: ino,
+		UID:    uint32(creds.EffectiveKUID),
+		GID:    uint32(creds.EffectiveKGID),
+		PID:    pid,
+	}
+
+	buf := make([]byte, hdr.Len)
+
+	// TODO(gvisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+	hdr.MarshalBytes(buf[:hdrLen])
+	payload.MarshalBytes(buf[hdrLen:])
+
+	return &Request{
+		id:   hdr.Unique,
+		hdr:  &hdr,
+		data: buf,
+	}, nil
+}
+
+// futureResponse represents an in-flight request that may or may not have
+// completed yet. Convert it to a resolved Response by calling resolve, but
+// note that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+	opcode linux.FUSEOpcode
+	ch     chan struct{}
+	hdr    *linux.FUSEHeaderOut
+	data   []byte
+
+	// async is true if this request is asynchronous.
+	async bool
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(req *Request) *futureResponse {
+	return &futureResponse{
+		opcode: req.hdr.Opcode,
+		ch:     make(chan struct{}),
+		async:  req.async,
+	}
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+	// Return directly for async requests.
+	if f.async {
+		return nil, nil
+	}
+
+	if err := t.Block(f.ch); err != nil {
+		return nil, err
+	}
+
+	return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+	return &Response{
+		opcode: f.opcode,
+		hdr:    *f.hdr,
+		data:   f.data,
+	}
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+	opcode linux.FUSEOpcode
+	hdr    linux.FUSEHeaderOut
+	data   []byte
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+	errno := r.hdr.Error
+	if errno >= 0 {
+		return nil
+	}
+
+	sysErrNo := syscall.Errno(-errno)
+	return error(sysErrNo)
+}
+
+// DataLen returns the size of the response without the header.
+func (r *Response) DataLen() uint32 {
+	return r.hdr.Len - uint32(r.hdr.SizeBytes())
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+	hdrLen := r.hdr.SizeBytes()
+	haveDataLen := r.hdr.Len - uint32(hdrLen)
+	wantDataLen := uint32(m.SizeBytes())
+
+	if haveDataLen < wantDataLen {
+		return fmt.Errorf("payload too small. Minimum data length required: %d, but got data length %d", wantDataLen, haveDataLen)
+	}
+
+	// The response data is nil when there is no payload, in which case
+	// there is nothing to unmarshal.
+	if r.data == nil {
+		return nil
+	}
+
+	// TODO(gvisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+	m.UnmarshalBytes(r.data[hdrLen:])
+	return nil
+}
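
A FUSE out-header carries 0 on success and a negated errno on failure, which Error() above converts into a syscall.Errno. A compact sketch of that decoding:

package main

import (
	"fmt"
	"syscall"
)

// errnoFromHeader mirrors Response.Error() above: a non-negative header
// error means success; a negative value is a negated Linux errno.
func errnoFromHeader(hdrError int32) error {
	if hdrError >= 0 {
		return nil
	}
	return syscall.Errno(-hdrError)
}

func main() {
	fmt.Println(errnoFromHeader(0))   // <nil>
	fmt.Println(errnoFromHeader(-2))  // no such file or directory (ENOENT)
	fmt.Println(errnoFromHeader(-13)) // permission denied (EACCES)
}
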
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
new file mode 100644
index 000000000..e1d9e3365
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList:  true,
+		AllowUserMount: true,
+	})
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a FUSE connection for the sentry to communicate
+// over, and returns the device FD that the server side would use.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+	vfsObj := &vfs.VirtualFilesystem{}
+	fuseDev := &DeviceFD{}
+
+	if err := vfsObj.Init(system.Ctx); err != nil {
+		return nil, nil, err
+	}
+
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef(system.Ctx)
+	if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, nil, err
+	}
+
+	fsopts := filesystemOptions{
+		maxActiveRequests: maxActiveRequests,
+	}
+	fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return fs.conn, &fuseDev.vfsfd, nil
+}
+
+type testPayload struct {
+	marshal.StubMarshallable
+	data uint32
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+	return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+	*t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+	return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+	t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) { + t.UnmarshalBytes(src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) { + panic("not implemented") +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// WriteTo implements io.WriterTo.WriteTo. +func (t *testPayload) WriteTo(w io.Writer) (int64, error) { + panic("not implemented") +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 16787116f..ad0afc41b 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -52,6 +52,7 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/p9", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 2a8011eb4..18c884b59 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -34,8 +34,11 @@ func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } -// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. -// d.isDir(). child must be a newly-created dentry that has never had a parent. +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +// * child must be a newly-created dentry that has never had a parent. func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.IncRef() // reference held by child on its parent child.parent = d @@ -46,7 +49,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.children[name] = child } -// Preconditions: d.dirMu must be locked. d.isDir(). +// Preconditions: +// * d.dirMu must be locked. +// * d.isDir(). func (d *dentry) cacheNegativeLookupLocked(name string) { // Don't cache negative lookups if InteropModeShared is in effect (since // this makes remote lookup unavoidable), or if d.isSynthetic() (in which @@ -79,10 +84,12 @@ type createSyntheticOpts struct { // createSyntheticChildLocked creates a synthetic file with the given name // in d. // -// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain -// a child with the given name. +// Preconditions: +// * d.dirMu must be locked. +// * d.isDir(). +// * d does not already contain a child with the given name. func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { - d2 := &dentry{ + child := &dentry{ refs: 1, // held by d fs: d.fs, ino: d.fs.nextSyntheticIno(), @@ -97,24 +104,25 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { case linux.S_IFDIR: // Nothing else needs to be done. 
case linux.S_IFSOCK: - d2.endpoint = opts.endpoint + child.endpoint = opts.endpoint case linux.S_IFIFO: - d2.pipe = opts.pipe + child.pipe = opts.pipe default: panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) } - d2.pf.dentry = d2 - d2.vfsd.Init(d2) + child.pf.dentry = child + child.vfsd.Init(child) - d.cacheNewChildLocked(d2, opts.name) + d.cacheNewChildLocked(child, opts.name) d.syntheticChildren++ } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } @@ -151,7 +159,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba return nil } -// Preconditions: d.isDir(). There exists at least one directoryFD representing d. +// Preconditions: +// * d.isDir(). +// * There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the // presence of concurrent mutation of an iterated directory, so diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 9a90351e5..94d96261b 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -115,9 +115,12 @@ func putDentrySlice(ds *[]*dentry) { // Dentries which may become cached as a result of the traversal are appended // to *ds. // -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata -// must be up to date. +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). +// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up +// to date. // // Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { @@ -185,8 +188,11 @@ afterSymlink: // getChildLocked returns a dentry representing the child of parent with the // given name. If no such child exists, getChildLocked returns (nil, nil). // -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. -// parent.isDir(). name is not "." or "..". +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * name is not "." or "..". // // Postconditions: If getChildLocked returns a non-nil dentry, its cached // metadata is up to date. @@ -206,7 +212,8 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) } -// Preconditions: As for getChildLocked. !parent.isSynthetic(). +// Preconditions: Same as getChildLocked, plus: +// * !parent.isSynthetic(). func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { if child != nil { // Need to lock child.metadataMu because we might be updating child @@ -279,9 +286,11 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). 
If -// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to -// date. +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). +// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up +// to date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -328,9 +337,10 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, // createInRemoteDir (if the parent directory is a real remote directory) or // createInSyntheticDir (if the parent directory is synthetic) to do so. // -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -399,7 +409,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a // stale dentry exists, the dentry will fail revalidation next time it's // used. - if err := createInRemoteDir(parent, name); err != nil { + if err := createInRemoteDir(parent, name, &ds); err != nil { return err } ev := linux.IN_CREATE @@ -414,7 +424,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } // No cached dentry exists; however, there might still be an existing file // at name. As above, we attempt the file creation RPC anyway. - if err := createInRemoteDir(parent, name); err != nil { + if err := createInRemoteDir(parent, name, &ds); err != nil { return err } if child, ok := parent.children[name]; ok && child == nil { @@ -721,7 +731,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -754,7 +764,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. 
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	creds := rp.Credentials()
-	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
 			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
 				return err
@@ -789,34 +799,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		// If the gofer does not allow creating a socket or pipe, create a
-		// synthetic one, i.e. one that is kept entirely in memory.
-		if err == syserror.EPERM {
-			switch opts.Mode.FileType() {
-			case linux.S_IFSOCK:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name:     name,
-					mode:     opts.Mode,
-					kuid:     creds.EffectiveKUID,
-					kgid:     creds.EffectiveKGID,
-					endpoint: opts.Endpoint,
-				})
-				return nil
-			case linux.S_IFIFO:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name: name,
-					mode: opts.Mode,
-					kuid: creds.EffectiveKUID,
-					kgid: creds.EffectiveKGID,
-					pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
-				})
-				return nil
-			}
+		if err != syserror.EPERM {
+			return err
 		}
-		return err
+
+		// EPERM means that the gofer does not allow creating a socket or pipe. Fall
+		// back to creating a synthetic one, i.e. one that is kept entirely in memory.
+
+		// Check that we're not overwriting an existing file with a synthetic one.
+		_, err = fs.stepLocked(ctx, rp, parent, true, ds)
+		switch {
+		case err == nil:
+			// Step succeeded, another file exists.
+			return syserror.EEXIST
+		case err != syserror.ENOENT:
+			// Unexpected error.
+			return err
+		}
+
+		switch opts.Mode.FileType() {
+		case linux.S_IFSOCK:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name:     name,
+				mode:     opts.Mode,
+				kuid:     creds.EffectiveKUID,
+				kgid:     creds.EffectiveKGID,
+				endpoint: opts.Endpoint,
+			})
+			return nil
+		case linux.S_IFIFO:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name: name,
+				mode: opts.Mode,
+				kuid: creds.EffectiveKUID,
+				kgid: creds.EffectiveKGID,
+				pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+			})
+			return nil
+		}
+		// Retain the error from the gofer if a synthetic file cannot be created internally.
+		return syserror.EPERM
 	}, nil)
 }
 
@@ -1001,7 +1026,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 	// step is required even if !d.cachedMetadataAuthoritative() because
 	// d.mappings has to be updated.
 	// d.metadataMu has already been acquired if trunc == true.
-	d.updateFileSizeLocked(0)
+	d.updateSizeLocked(0)
 
 	if d.cachedMetadataAuthoritative() {
 		d.touchCMtimeLocked()
@@ -1072,8 +1097,10 @@ retry:
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-// !d.isSynthetic().
+// Preconditions: +// * d.fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !d.isSynthetic(). func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -1284,6 +1311,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if !renamed.isDir() { return syserror.EISDIR } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } } else { if rp.MustBeDir() || renamed.isDir() { return syserror.ENOTDIR @@ -1334,14 +1364,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // with reference counts and queue oldParent for checkCachingLocked if the // parent isn't actually changing. if oldParent != newParent { + oldParent.decRefLocked() ds = appendDentry(ds, oldParent) newParent.IncRef() if renamed.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } + renamed.parent = newParent } - renamed.parent = newParent renamed.name = newName if newParent.children == nil { newParent.children = make(map[string]*dentry) @@ -1385,11 +1416,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) @@ -1452,7 +1483,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err @@ -1464,7 +1495,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() @@ -1485,13 +1516,15 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath path: opts.Addr, }, nil } - return d.endpoint, nil + if d.endpoint != nil { + return d.endpoint, nil + } } return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. 
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -1499,11 +1532,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si if err != nil { return nil, err } - return d.listxattr(ctx, rp.Credentials(), size) + return d.listXattr(ctx, rp.Credentials(), size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -1511,11 +1544,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return "", err } - return d.getxattr(ctx, rp.Credentials(), &opts) + return d.getXattr(ctx, rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) @@ -1523,18 +1556,18 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + err = d.setXattr(ctx, rp.Credentials(), &opts) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) @@ -1542,11 +1575,11 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + err = d.removeXattr(ctx, rp.Credentials(), name) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 63e589859..f1dad1b08 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -62,9 +62,13 @@ import ( const Name = "9p" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. 
+// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -77,7 +81,7 @@ type filesystem struct { iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. - client *p9.Client + client *p9.Client `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock @@ -95,7 +99,7 @@ type filesystem struct { // reference count (such that it is usable as vfs.ResolvingPath.Start() or // is reachable from its children), or if it is a child dentry (such that // it is reachable from its parent). - renameMu sync.RWMutex + renameMu sync.RWMutex `state:"nosave"` // cachedDentries contains all dentries with 0 references. (Due to race // conditions, it may also contain dentries with non-zero references.) @@ -107,7 +111,7 @@ type filesystem struct { // syncableDentries contains all dentries in this filesystem for which // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. // These fields are protected by syncMu. - syncMu sync.Mutex + syncMu sync.Mutex `state:"nosave"` syncableDentries map[*dentry]struct{} specialFileFDs map[*specialFileFD]struct{} @@ -120,6 +124,8 @@ type filesystem struct { // dentries, it comes from QID.Path from the 9P server. Synthetic dentries // have have their inodeNumber generated sequentially, with the MSB reserved to // prevent conflicts with regular dentries. +// +// +stateify savable type inodeNumber uint64 // Reserve MSB for synthetic mounts. @@ -132,6 +138,7 @@ func inoFromPath(path uint64) inodeNumber { return inodeNumber(path &^ syntheticInoMask) } +// +stateify savable type filesystemOptions struct { // "Standard" 9P options. fd int @@ -177,6 +184,8 @@ type filesystemOptions struct { // InteropMode controls the client's interaction with other remote filesystem // users. +// +// +stateify savable type InteropMode uint32 const ( @@ -195,11 +204,7 @@ const ( // and consistent with Linux's semantics (in particular, it is not always // possible for clients to set arbitrary atimes and mtimes depending on the // remote filesystem implementation, and never possible for clients to set - // arbitrary ctimes.) If a dentry containing a client-defined atime or - // mtime is evicted from cache, client timestamps will be sent to the - // remote filesystem on a best-effort basis to attempt to ensure that - // timestamps will be preserved when another dentry representing the same - // file is instantiated. + // arbitrary ctimes.) InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -239,6 +244,8 @@ const ( // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +// +// +stateify savable type InternalFilesystemOptions struct { // If LeakConnection is true, do not close the connection to the server // when the Filesystem is released. This is necessary for deployments in @@ -265,6 +272,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -538,6 +548,8 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -567,7 +579,7 @@ type dentry struct {
 	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
 	// that does not exist on the remote filesystem. As of this writing, the
 	// only files that can be synthetic are sockets, pipes, and directories.
-	file p9file
+	file p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
 
 	// If deleted is non-zero, the file represented by this dentry has been
 	// deleted. deleted is accessed using atomic memory operations.
@@ -579,7 +591,7 @@ type dentry struct {
 	cached bool
 	dentryEntry
 
-	dirMu sync.Mutex
+	dirMu sync.Mutex `state:"nosave"`
 
 	// If this dentry represents a directory, children contains:
 	//
@@ -611,7 +623,7 @@ type dentry struct {
 	// To mutate:
 	// - Lock metadataMu and use atomic operations to update because we might
 	//   have atomic readers that don't hold the lock.
-	metadataMu sync.Mutex
+	metadataMu sync.Mutex `state:"nosave"`
 	ino        inodeNumber // immutable
 	mode       uint32      // type is immutable, perms are mutable
 	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
@@ -642,7 +654,7 @@ type dentry struct {
 	// other metadata fields.
 	nlink uint32
 
-	mapsMu sync.Mutex
+	mapsMu sync.Mutex `state:"nosave"`
 
 	// If this dentry represents a regular file, mappings tracks mappings of
 	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
@@ -666,12 +678,12 @@ type dentry struct {
 	// either p9.File transitions from closed (isNil() == true) to open
 	// (isNil() == false), it may be mutated with handleMu locked, but cannot
 	// be closed until the dentry is destroyed.
-	handleMu  sync.RWMutex
-	readFile  p9file
-	writeFile p9file
+	handleMu  sync.RWMutex `state:"nosave"`
+	readFile  p9file       `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+	writeFile p9file       `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
 	hostFD    int32
 
-	dataMu sync.RWMutex
+	dataMu sync.RWMutex `state:"nosave"`
 
 	// If this dentry represents a regular file that is client-cached, cache
 	// maps offsets into the cached file to offsets into
@@ -703,6 +715,13 @@ type dentry struct {
 	locks vfs.FileLocks
 
 	// Inotify watches for this dentry.
+	//
+	// Note that inotify may behave unexpectedly in the presence of hard links,
+	// because dentries corresponding to the same file have separate inotify
+	// watches when they should share the same set. This is the case because it is
+	// impossible for us to know for sure whether two dentries correspond to the
+	// same underlying file (see the gofer filesystem section of vfs/inotify.md for
+	// a more in-depth discussion on this matter).
 	watches vfs.Watches
 }
 
@@ -830,7 +849,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
 		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
 	}
 	if mask.Size {
-		d.updateFileSizeLocked(attr.Size)
+		d.updateSizeLocked(attr.Size)
 	}
 }
 
@@ -984,7 +1003,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 			// d.size should be kept up to date, and privatized
 			// copy-on-write mappings of truncated pages need to be
 			// invalidated, even if InteropModeShared is in effect.
- d.updateFileSizeLocked(stat.Size) + d.updateSizeLocked(stat.Size) } } if d.fs.opts.interop == InteropModeShared { @@ -1021,8 +1040,31 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } +// doAllocate performs an allocate operation on d. Note that d.metadataMu will +// be held when allocate is called. +func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Allocating a smaller size is a noop. + size := offset + length + if d.cachedMetadataAuthoritative() && size <= d.size { + return nil + } + + err := allocate() + if err != nil { + return err + } + d.updateSizeLocked(size) + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + // Preconditions: d.metadataMu must be locked. -func (d *dentry) updateFileSizeLocked(newSize uint64) { +func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() oldSize := d.size atomic.StoreUint64(&d.size, newSize) @@ -1060,6 +1102,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + // We only support xattrs prefixed with "user." (see b/148380782). Currently, + // there is no need to expose any other xattrs through a gofer. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + kuid := auth.KUID(atomic.LoadUint32(&d.uid)) + kgid := auth.KGID(atomic.LoadUint32(&d.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err + } + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) +} + func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid))) } @@ -1293,30 +1350,19 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.handleMu.Unlock() if !d.file.isNil() { - if !d.isDeleted() { - // Write dirty timestamps back to the remote filesystem. - atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0 - mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0 - if atimeDirty || mtimeDirty { - atime := atomic.LoadInt64(&d.atime) - mtime := atomic.LoadInt64(&d.mtime) - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - ATime: atimeDirty, - ATimeNotSystemTime: atimeDirty, - MTime: mtimeDirty, - MTimeNotSystemTime: mtimeDirty, - }, p9.SetAttr{ - ATimeSeconds: uint64(atime / 1e9), - ATimeNanoSeconds: uint64(atime % 1e9), - MTimeSeconds: uint64(mtime / 1e9), - MTimeNanoSeconds: uint64(mtime % 1e9), - }); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err) - } - } + // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, + // i.e. client and server timestamps may differ (because e.g. a client + // write was serviced by the page cache, and only written back to the + // remote file later). Ideally, we'd write client timestamps back to + // the remote filesystem so that timestamps for a new dentry + // instantiated for the same file would remain coherent. 
Unfortunately, + // this turns out to be too expensive in many cases, so for now we + // don't do this. + if err := d.file.close(ctx); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) } - d.file.close(ctx) d.file = p9file{} + // Remove d from the set of syncable dentries. d.fs.syncMu.Lock() delete(d.fs.syncableDentries, d) @@ -1344,9 +1390,7 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -// We only support xattrs prefixed with "user." (see b/148380782). Currently, -// there is no need to expose any other xattrs through a gofer. -func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { +func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { if d.file.isNil() || !d.userXattrSupported() { return nil, nil } @@ -1356,6 +1400,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui } xattrs := make([]string, 0, len(xattrMap)) for x := range xattrMap { + // We only support xattrs in the user.* namespace. if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { xattrs = append(xattrs, x) } @@ -1363,51 +1408,33 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { +func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.file.isNil() { return "", syserror.ENODATA } - if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP - } - if !d.userXattrSupported() { - return "", syserror.ENODATA - } return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { +func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.file.isNil() { return syserror.EPERM } - if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !d.userXattrSupported() { - return syserror.EPERM - } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { +func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.file.isNil() { return syserror.EPERM } - if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !d.userXattrSupported() { - return syserror.EPERM - } return d.file.removeXattr(ctx, name) } @@ -1418,7 +1445,9 @@ func (d *dentry) userXattrSupported() bool { return filetype == linux.ModeRegular || filetype == linux.ModeDirectory } -// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir(). +// Preconditions: +// * !d.isSynthetic(). +// * d.isRegularFile() || d.isDir(). 
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). @@ -1463,8 +1492,9 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool return err } - if d.hostFD < 0 && openReadable && h.fd >= 0 { - // We have no existing FD; use the new FD for at least reading. + if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) { + // We have no existing FD, and the new FD meets the requirements + // for d.hostFD, so start using it. d.hostFD = h.fd } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable { // We have an existing read-only FD, but the file has just been @@ -1613,12 +1643,14 @@ func (d *dentry) decLinks() { // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD - lockLogging sync.Once + lockLogging sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. } func (fd *fileDescription) filesystem() *filesystem { @@ -1656,30 +1688,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return nil } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() - if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { + if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() - if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { + if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 104157512..a9ebe1206 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -25,6 +25,8 @@ import ( // handle represents a remote "open file descriptor", consisting of an opened // fid (p9.File) and optionally a host file descriptor. +// +// These are explicitly not savable. type handle struct { file p9file fd int32 // -1 if unavailable diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 87f0b877f..21b4a96fe 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error { return err } +func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttrClose(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { ctx.UninterruptibleSleepStart(false) fdobj, qid, iounit, err := f.file.Open(flags) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 7e1cbf065..f8b19bae7 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -39,11 +39,12 @@ func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is protected by mu. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 } @@ -56,10 +57,16 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } - // Skip flushing if writes may be buffered by the client, since (as with - // the VFS1 client) we don't flush buffered writes on close anyway. + // Skip flushing if there are client-buffered writes, since (as with the + // VFS1 client) we don't flush buffered writes on close anyway. d := fd.dentry() - if d.fs.opts.interop == InteropModeExclusive { + if d.fs.opts.interop != InteropModeExclusive { + return nil + } + d.dataMu.RLock() + haveDirtyPages := !d.dirty.IsEmpty() + d.dataMu.RUnlock() + if haveDirtyPages { return nil } d.handleMu.RLock() @@ -73,28 +80,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - - // Allocating a smaller size is a noop. 
- size := offset + length - if d.cachedMetadataAuthoritative() && size <= d.size { - return nil - } - - d.handleMu.RLock() - err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) - d.handleMu.RUnlock() - if err != nil { - return err - } - d.dataMu.Lock() - atomic.StoreUint64(&d.size, size) - d.dataMu.Unlock() - if d.cachedMetadataAuthoritative() { - d.touchCMtimeLocked() - } - return nil + return d.doAllocate(ctx, offset, length, func() error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) } // PRead implements vfs.FileDescriptionImpl.PRead. @@ -117,6 +107,10 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, io.EOF } + var ( + n int64 + readErr error + ) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Lock d.metadataMu for the rest of the read to prevent d.size from // changing. @@ -127,20 +121,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { return 0, err } - } - - rw := getDentryReadWriter(ctx, d, offset) - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + rw := getDentryReadWriter(ctx, d, offset) // Require the read to go to the remote file. rw.direct = true + n, readErr = dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtimeLocked(fd.vfsfd.Mount()) + } + } else { + rw := getDentryReadWriter(ctx, d, offset) + n, readErr = dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtime(fd.vfsfd.Mount()) + } } - n, err := dst.CopyOutFrom(ctx, rw) - putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { - // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(fd.vfsfd.Mount()) - } - return n, err + return n, readErr } // Read implements vfs.FileDescriptionImpl.Read. @@ -396,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt) + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { @@ -404,10 +403,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) rw.d.handleMu.RUnlock() return done, err } - // err might have occurred in part of gap.Range() outside - // gapMR. Forget about it for now; if the error matters and - // persists, we'll run into it again in a later iteration of - // this loop. + // err might have occurred in part of gap.Range() outside gapMR + // (in particular, gap.End() might be beyond EOF). Forget about + // it for now; if the error matters and persists, we'll run + // into it again in a later iteration of this loop. } else { // Read directly from the file. 
gapDsts := dsts.TakeFirst64(gapMR.Length()) @@ -781,7 +780,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab mf := d.fs.mfp.MemoryFile() h := d.readHandleLocked() - cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt) + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -900,6 +899,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.hostFD >= 0), and that FD is used for application // memory mappings (i.e. !filesystem.opts.forcePageCache). +// +// +stateify savable type dentryPlatformFile struct { *dentry @@ -912,7 +913,7 @@ type dentryPlatformFile struct { hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. - hostFileMapperInitOnce sync.Once + hostFileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. } // IncRef implements memmap.File.IncRef. diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index 85d2bee72..326b940a7 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -36,12 +36,14 @@ func (d *dentry) isSocket() bool { // An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() // is called and either BoundEndpoint.BidirectionalConnect or // BoundEndpoint.UnidirectionalConnect is called. +// +// +stateify savable type endpoint struct { // dentry is the filesystem dentry which produced this endpoint. dentry *dentry // file is the p9 file that contains a single unopened fid. - file p9.File + file p9.File `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // path is the sentry path where this endpoint is bound. path string diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 3c39aa9b7..71581736c 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -33,11 +34,13 @@ import ( // special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is // in effect) regular files. specialFileFD differs from regularFileFD by using // per-FD handles instead of shared per-dentry handles, and never buffering I/O. +// +// +stateify savable type specialFileFD struct { fileDescription // handle is used for file I/O. handle is immutable. - handle handle + handle handle `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // isRegularFile is true if this FD represents a regular file which is only // possible when filesystemOptions.regularFilesUseSpecialFileFD is in @@ -55,7 +58,7 @@ type specialFileFD struct { queue waiter.Queue // If seekable is true, off is the file offset. off is protected by mu. 
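Several types in this change gain +stateify savable while their mutexes are tagged `state:"nosave"`. The pattern works because a sync.Mutex's zero value is an unlocked, ready-to-use mutex, so nothing is lost by skipping it at checkpoint; only the data it guards is saved. A small illustration of the convention (the +stateify annotation is gVisor's code-generator marker and has no effect in this standalone sketch):

package main

import "sync"

// savedFD shows the convention: the struct is savable (in gVisor, via the
// +stateify savable annotation), while the mutex carries `state:"nosave"`
// because it is always restored as its unlocked zero value.
type savedFD struct {
	mu  sync.Mutex `state:"nosave"` // not saved; zero value is valid
	off int64      // protected by mu; this field is saved
}

func (fd *savedFD) seek(delta int64) int64 {
	fd.mu.Lock()
	defer fd.mu.Unlock()
	fd.off += delta
	return fd.off
}

func main() {
	fd := &savedFD{}
	_ = fd.seek(42)
}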
- mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 } @@ -135,6 +138,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { fd.fileDescription.EventUnregister(e) } +func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + if fd.isRegularFile { + d := fd.dentry() + return d.doAllocate(ctx, offset, length, func() error { + return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) + } + return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if fd.seekable && offset < 0 { @@ -235,11 +248,12 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off d.touchCMtime() } buf := make([]byte, src.NumBytes()) - // Don't do partial writes if we get a partial read from src. - if _, err := src.CopyIn(ctx, buf); err != nil { - return 0, offset, err + copied, copyErr := src.CopyIn(ctx, buf) + if copied == 0 && copyErr != nil { + // Only return the error if we didn't get any data. + return 0, offset, copyErr } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset)) if err == syserror.EAGAIN { err = syserror.ErrWouldBlock } @@ -256,7 +270,10 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off atomic.StoreUint64(&d.size, uint64(offset)) } } - return int64(n), offset, err + if err != nil { + return int64(n), offset, err + } + return int64(n), offset, copyErr } // Write implements vfs.FileDescriptionImpl.Write. diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index e59d07e90..7e825caae 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -52,8 +52,23 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { mnt.EndWrite() } -// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has -// successfully called vfs.Mount.CheckBeginWrite(). +// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true. +func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { + if mnt.Flags.NoATime || mnt.ReadOnly() { + return + } + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.atime, now) + atomic.StoreUint32(&d.atimeDirty, 1) + mnt.EndWrite() +} + +// Preconditions: +// * d.cachedMetadataAuthoritative() == true. +// * The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -61,8 +76,9 @@ func (d *dentry) touchCtime() { d.metadataMu.Unlock() } -// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has -// successfully called vfs.Mount.CheckBeginWrite(). +// Preconditions: +// * d.cachedMetadataAuthoritative() == true. +// * The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCMtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -72,8 +88,9 @@ func (d *dentry) touchCMtime() { d.metadataMu.Unlock() } -// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has -// locked d.metadataMu. +// Preconditions: +// * d.cachedMetadataAuthoritative() == true. 
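The specialFileFD.pwrite change above stops discarding partially copied data: CopyIn can return both bytes and an error, the write proceeds with whatever prefix was copied, and the copy error is only surfaced when no data was transferred at all, or after the write itself succeeded. A self-contained sketch of that error ordering:

package main

import (
	"errors"
	"fmt"
)

// writePartial mirrors the error handling adopted above: copyIn may return
// both data and an error; the write proceeds with the copied prefix, a
// write error takes precedence, and the copy error is only reported once
// the copied bytes have been written out.
func writePartial(copyIn func([]byte) (int, error), write func([]byte) (int, error), size int) (int, error) {
	buf := make([]byte, size)
	copied, copyErr := copyIn(buf)
	if copied == 0 && copyErr != nil {
		// Only return the error if we didn't get any data.
		return 0, copyErr
	}
	n, err := write(buf[:copied])
	if err != nil {
		return n, err
	}
	return n, copyErr
}

func main() {
	copyIn := func(b []byte) (int, error) { copy(b, "hi"); return 2, errors.New("page fault after 2 bytes") }
	write := func(b []byte) (int, error) { return len(b), nil }
	fmt.Println(writePartial(copyIn, write, 8)) // 2 "page fault after 2 bytes"
}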
+// * The caller has locked d.metadataMu. func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&d.mtime, now) diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index bd701bbc7..56bcf9bdb 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -1,12 +1,37 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "host", + prefix = "inode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "inode", + }, +) + +go_template_instance( + name = "connected_endpoint_refs", + out = "connected_endpoint_refs.go", + package = "host", + prefix = "ConnectedEndpoint", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "ConnectedEndpoint", + }, +) + go_library( name = "host", srcs = [ + "connected_endpoint_refs.go", "control.go", "host.go", + "inode_refs.go", "ioctl_unsafe.go", "mmap.go", "socket.go", @@ -24,6 +49,7 @@ go_library( "//pkg/fspath", "//pkg/iovec", "//pkg/log", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 56869f59a..698e913fe 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" @@ -41,6 +40,44 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +func newInode(fs *filesystem, hostFD int, fileType linux.FileMode, isTTY bool) (*inode, error) { + // Determine if hostFD is seekable. If not, this syscall will return ESPIPE + // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character + // devices. + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + seekable := err != syserror.ESPIPE + + i := &inode{ + hostFD: hostFD, + ino: fs.NextIno(), + isTTY: isTTY, + wouldBlock: wouldBlock(uint32(fileType)), + seekable: seekable, + // NOTE(b/38213152): Technically, some obscure char devices can be memory + // mapped, but we only allow regular files. + canMap: fileType == linux.S_IFREG, + } + i.pf.inode = i + i.EnableLeakCheck() + + // Non-seekable files can't be memory mapped, assert this. + if !i.seekable && i.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") + } + + // If the hostFD would block, we must set it to non-blocking and handle + // blocking behavior in the sentry. + if i.wouldBlock { + if err := syscall.SetNonblock(i.hostFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { + return nil, err + } + } + return i, nil +} + // NewFDOptions contains options to NewFD. type NewFDOptions struct { // If IsTTY is true, the file descriptor is a TTY. @@ -76,45 +113,12 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) flags = uint32(flagsInt) } - fileMode := linux.FileMode(s.Mode) - fileType := fileMode.FileType() - - // Determine if hostFD is seekable. If not, this syscall will return ESPIPE - // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character - // devices. 
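newInode decides i.seekable with a zero-displacement lseek probe, exactly as the added code shows: pipes, sockets, and some character devices fail with ESPIPE (see Linux's fs/read_write.c:llseek). A standalone version of the probe:

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// isSeekable probes an fd the same way newInode does above: lseek(fd, 0,
// SEEK_CUR) fails with ESPIPE on pipes, sockets, and some character
// devices, and succeeds on regular files and most others.
func isSeekable(fd int) bool {
	_, err := unix.Seek(fd, 0, unix.SEEK_CUR)
	return err != unix.ESPIPE
}

func main() {
	r, w, _ := os.Pipe()
	defer r.Close()
	defer w.Close()
	fmt.Println(isSeekable(int(os.Stdin.Fd()))) // usually true for a tty or file
	fmt.Println(isSeekable(int(r.Fd())))        // false: pipes are not seekable
}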
- _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) - seekable := err != syserror.ESPIPE - - i := &inode{ - hostFD: hostFD, - ino: fs.NextIno(), - isTTY: opts.IsTTY, - wouldBlock: wouldBlock(uint32(fileType)), - seekable: seekable, - // NOTE(b/38213152): Technically, some obscure char devices can be memory - // mapped, but we only allow regular files. - canMap: fileType == linux.S_IFREG, - } - i.pf.inode = i - - // Non-seekable files can't be memory mapped, assert this. - if !i.seekable && i.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - // If the hostFD would block, we must set it to non-blocking and handle - // blocking behavior in the sentry. - if i.wouldBlock { - if err := syscall.SetNonblock(i.hostFD, true); err != nil { - return nil, err - } - if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { - return nil, err - } - } - d := &kernfs.Dentry{} - d.Init(i) + i, err := newInode(fs, hostFD, linux.FileMode(s.Mode).FileType(), opts.IsTTY) + if err != nil { + return nil, err + } + d.Init(&fs.Filesystem, i) // i.open will take a reference on d. defer d.DecRef(ctx) @@ -122,7 +126,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, // etc.), and use the offset from the host fd otherwise when importing. - return i.open(ctx, d.VFSDentry(), mnt, flags) + return i.open(ctx, d, mnt, flags) } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. @@ -133,18 +137,23 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs } // filesystemType implements vfs.FilesystemType. +// +// +stateify savable type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("host.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "none" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // NewFilesystem sets up and returns a new hostfs filesystem. // // Note that there should only ever be one instance of host.filesystem, @@ -162,6 +171,8 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -181,14 +192,18 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { + kernfs.InodeNoStatFS kernfs.InodeNotDirectory kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. locks vfs.FileLocks // When the reference count reaches zero, the host fd is closed. - refs.AtomicRefCount + inodeRefs // hostFD contains the host fd that this file was originally created from, // which must be available at time of restore. @@ -228,7 +243,7 @@ type inode struct { canMap bool // mapsMu protects mappings. 
- mapsMu sync.Mutex + mapsMu sync.Mutex `state:"nosave"` // If canMap is true, mappings tracks mappings of hostFD into // memmap.MappingSpaces. @@ -238,7 +253,7 @@ type inode struct { pf inodePlatformFile } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -247,7 +262,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } -// Mode implements kernfs.Inode. +// Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -258,7 +273,7 @@ func (i *inode) Mode() linux.FileMode { return linux.FileMode(s.Mode) } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL @@ -371,7 +386,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { }, nil } -// SetStat implements kernfs.Inode. +// SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := &opts.Stat @@ -430,9 +445,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre return nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(ctx context.Context) { - i.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + i.inodeRefs.DecRef(func() { if i.wouldBlock { fdnotifier.RemoveFD(int32(i.hostFD)) } @@ -442,16 +457,16 @@ func (i *inode) DecRef(ctx context.Context) { }) } -// Open implements kernfs.Inode. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. if i.Mode().FileType() == linux.S_IFSOCK { return nil, syserror.ENXIO } - return i.open(ctx, vfsd, rp.Mount(), opts.Flags) + return i.open(ctx, d, rp.Mount(), opts.Flags) } -func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err @@ -475,17 +490,17 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return nil, err } // Currently, we only allow Unix sockets to be imported. 
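DecRef changes shape here: refs.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context){...}) becomes the generated inodeRefs.DecRef(func(){...}). The following is a simplified stand-in for what a //pkg/refs_vfs2:refs_template instantiation provides; the counter semantics (zero value carries one reference) follow the template's convention, but the real generated code also does leak logging and stricter misuse checking.

package main

import (
	"fmt"
	"sync/atomic"
)

// inodeRefs models the API surface of a refs_template instantiation:
// IncRef, DecRef(destroy), and EnableLeakCheck.
type inodeRefs struct {
	refCount int64 // zero value means "one reference held"
}

func (r *inodeRefs) EnableLeakCheck() { /* registers a finalizer in the real code */ }

func (r *inodeRefs) IncRef() {
	if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
		panic("IncRef on released inodeRefs")
	}
}

func (r *inodeRefs) DecRef(destroy func()) {
	switch v := atomic.AddInt64(&r.refCount, -1); {
	case v < -1:
		panic("DecRef on released inodeRefs")
	case v == -1:
		if destroy != nil {
			destroy() // e.g. close the host fd, as inode.DecRef does above
		}
	}
}

func main() {
	var r inodeRefs
	r.EnableLeakCheck()
	r.IncRef()
	r.DecRef(nil)
	r.DecRef(func() { fmt.Println("destroyed") })
}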
- return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) + return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks) case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR: if i.isTTY { fd := &TTYFileDescription{ fileDescription: fileDescription{inode: i}, - termios: linux.DefaultSlaveTermios, + termios: linux.DefaultReplicaTermios, } fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil @@ -494,7 +509,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u fd := &fileDescription{inode: i} fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil @@ -506,6 +521,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -520,40 +537,35 @@ type fileDescription struct { inode *inode // offsetMu protects offset. - offsetMu sync.Mutex + offsetMu sync.Mutex `state:"nosave"` // offset specifies the current file offset. It is only meaningful when // inode.seekable is true. offset int64 } -// SetStat implements vfs.FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } -// Stat implements vfs.FileDescriptionImpl. +// Stat implements vfs.FileDescriptionImpl.Stat. func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) } -// Release implements vfs.FileDescriptionImpl. +// Release implements vfs.FileDescriptionImpl.Release. func (f *fileDescription) Release(context.Context) { // noop } -// Allocate implements vfs.FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { - if !f.inode.seekable { - return syserror.ESPIPE - } - - // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds. - return syserror.EOPNOTSUPP + return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) } -// PRead implements FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode if !i.seekable { @@ -563,7 +575,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) } -// Read implements FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. 
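hostfs Allocate now forwards directly to fallocate(2) on the backing host fd instead of returning EOPNOTSUPP, letting the host filesystem decide which modes it supports. A standalone equivalent of the call path:

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// allocate mirrors the new hostfs Allocate: pass mode/offset/length
// straight through to fallocate(2) on the host fd.
func allocate(hostFD int, mode uint32, offset, length int64) error {
	return unix.Fallocate(hostFD, mode, offset, length)
}

func main() {
	f, err := os.CreateTemp("", "alloc")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// Reserve 1 MiB; mode 0 allocates blocks and extends the file size.
	if err := allocate(int(f.Fd()), 0, 0, 1<<20); err != nil {
		fmt.Println("fallocate failed:", err)
	}
	fi, _ := f.Stat()
	fmt.Println("size:", fi.Size())
}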
func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { i := f.inode if !i.seekable { @@ -600,7 +612,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off return int64(n), err } -// PWrite implements FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if !f.inode.seekable { return 0, syserror.ESPIPE @@ -609,7 +621,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of return f.writeToHostFD(ctx, src, offset, opts.Flags) } -// Write implements FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode if !i.seekable { @@ -657,7 +669,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque return int64(n), err } -// Seek implements FileDescriptionImpl. +// Seek implements vfs.FileDescriptionImpl.Seek. // // Note that we do not support seeking on directories, since we do not even // allow directory fds to be imported at all. @@ -722,13 +734,13 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return f.offset, nil } -// Sync implements FileDescriptionImpl. +// Sync implements vfs.FileDescriptionImpl.Sync. func (f *fileDescription) Sync(context.Context) error { // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. return unix.Fsync(f.inode.hostFD) } -// ConfigureMMap implements FileDescriptionImpl. +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { if !f.inode.canMap { return syserror.ENODEV diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go index 65d3af38c..b51a17bed 100644 --- a/pkg/sentry/fsimpl/host/mmap.go +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -27,11 +27,13 @@ import ( // cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // inodePlatformFile should only be used if inode.canMap is true. +// +// +stateify savable type inodePlatformFile struct { *inode // fdRefsMu protects fdRefs. - fdRefsMu sync.Mutex + fdRefsMu sync.Mutex `state:"nosave"` // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. @@ -41,7 +43,7 @@ type inodePlatformFile struct { fileMapper fsutil.HostFileMapper // fileMapperInitOnce is used to lazily initialize fileMapper. - fileMapperInitOnce sync.Once + fileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. } // IncRef implements memmap.File.IncRef. diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 4979dd0a9..8a447e29f 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" @@ -59,8 +58,7 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor // // +stateify savable type ConnectedEndpoint struct { - // ref keeps track of references to a ConnectedEndpoint. 
-	ref refs.AtomicRefCount
+	ConnectedEndpointRefs

 	// mu protects fd below.
 	mu sync.RWMutex `state:"nosave"`
@@ -132,9 +130,9 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
 		return nil, err
 	}

-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()

 	return &e, nil
 }

@@ -318,7 +316,7 @@ func (c *ConnectedEndpoint) destroyLocked() {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (c *ConnectedEndpoint) Release(ctx context.Context) {
-	c.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	c.DecRef(func() {
 		c.mu.Lock()
 		c.destroyLocked()
 		c.mu.Unlock()
@@ -348,12 +346,12 @@ func (e *SCMConnectedEndpoint) Init() error {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
-	e.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	e.DecRef(func() {
 		e.mu.Lock()
+		fdnotifier.RemoveFD(int32(e.fd))
 		if err := syscall.Close(e.fd); err != nil {
 			log.Warningf("Failed to close host fd %d: %v", e.fd, err)
 		}
-		fdnotifier.RemoveFD(int32(e.fd))
 		e.destroyLocked()
 		e.mu.Unlock()
 	})
@@ -378,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
 		return nil, err
 	}

-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()

 	return &e, nil
 }
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
 	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC

 	if n > length {
-		return length, n, msg.Controllen, controlTrunc, err
+		return length, n, msg.Controllen, controlTrunc, nil
 	}

-	return n, n, msg.Controllen, controlTrunc, err
+	return n, n, msg.Controllen, controlTrunc, nil
 }

 // fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 27cbd3059..f5c596fec 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,6 +17,7 @@ package host
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -29,6 +30,8 @@ import (
 // TTYFileDescription implements vfs.FileDescriptionImpl for a host file
 // descriptor that wraps a TTY FD.
+//
+// +stateify savable
 type TTYFileDescription struct {
 	fileDescription

@@ -75,7 +78,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) {
 	t.fileDescription.Release(ctx)
 }

-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 //
 // Reading from a TTY is only allowed for foreground process groups. Background
 // process groups will either get EIO or a SIGTTIN.
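Note the deliberate reordering inside SCMConnectedEndpoint.Release above: the fd is removed from fdnotifier before it is closed. Closing first would open a window in which the kernel could reuse the fd number while the epoll registration still referenced it. A generic sketch of the safe teardown order, with a hypothetical notifier type standing in for fdnotifier:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// notifier is a stand-in for fdnotifier: anything that holds an epoll
// (or similar) registration keyed by fd number.
type notifier struct{ registered map[int]bool }

func (n *notifier) RemoveFD(fd int) { delete(n.registered, fd) }

// release tears down an fd in the order used above: deregister first,
// then close. If we closed first, the kernel could hand the same fd
// number to another file before RemoveFD ran, and the registration
// would silently track the wrong file.
func release(n *notifier, fd int) error {
	n.RemoveFD(fd)
	return unix.Close(fd)
}

func main() {
	fds := make([]int, 2)
	if err := unix.Pipe(fds); err != nil {
		panic(err)
	}
	n := &notifier{registered: map[int]bool{fds[0]: true}}
	fmt.Println(release(n, fds[0]))
	unix.Close(fds[1])
}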
@@ -93,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, return t.fileDescription.PRead(ctx, dst, offset, opts) } -// Read implements vfs.FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -111,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o return t.fileDescription.Read(ctx, dst, opts) } -// PWrite implements vfs.FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -126,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, return t.fileDescription.PWrite(ctx, src, offset, opts) } -// Write implements vfs.FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -141,8 +144,13 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, return t.fileDescription.Write(ctx, src, opts) } -// Ioctl implements vfs.FileDescriptionImpl. +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. This is the real FD: fd := t.inode.hostFD ioctl := args[1].Uint64() @@ -152,9 +160,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = termios.CopyOut(task, args[2].Pointer()) return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: @@ -166,9 +172,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch } var termios linux.Termios - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetTermios(fd, ioctl, &termios) @@ -192,10 +196,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch defer t.mu.Unlock() // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. - pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }) + pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup)) + _, err := pgID.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: @@ -203,11 +205,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. 
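The ioctl handlers below now marshal through the task with generated CopyIn/CopyOut methods instead of usermem.CopyObjectIn/Out, and bare scalars are wrapped in primitive.Int32 so they expose the same interface. A trimmed-down model of what such a wrapper does; the real marshal.Marshallable interface and kernel.Task plumbing are more involved than this sketch.

package main

import (
	"encoding/binary"
	"fmt"
)

// copyContext stands in for the role kernel.Task plays here: something
// that can move bytes to and from a (simulated) user address space.
type copyContext struct{ mem map[uintptr][]byte }

func (cc *copyContext) copyOut(addr uintptr, b []byte) (int, error) {
	cc.mem[addr] = append([]byte(nil), b...)
	return len(b), nil
}

// int32Primitive mirrors the role of primitive.Int32: give a bare int32
// a CopyOut method so scalar ioctl arguments marshal like any other
// generated type, without hand-rolled usermem plumbing at each call site.
type int32Primitive int32

func (i int32Primitive) CopyOut(cc *copyContext, addr uintptr) (int, error) {
	var buf [4]byte
	binary.LittleEndian.PutUint32(buf[:], uint32(i))
	return cc.copyOut(addr, buf[:])
}

func main() {
	cc := &copyContext{mem: map[uintptr][]byte{}}
	pgID := int32Primitive(1234) // e.g. the foreground process group for TIOCGPGRP
	n, err := pgID.CopyOut(cc, 0x1000)
	fmt.Println(n, err, cc.mem[0x1000])
}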
- task := kernel.TaskFromContext(ctx) - if task == nil { - return 0, syserror.ENOTTY - } - t.mu.Lock() defer t.mu.Unlock() @@ -226,12 +223,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch return 0, syserror.ENOTTY } - var pgID kernel.ProcessGroupID - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgIDP primitive.Int32 + if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } + pgID := kernel.ProcessGroupID(pgIDP) // pgID must be non-negative. if pgID < 0 { @@ -260,9 +256,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = winsize.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSWINSZ: @@ -273,9 +267,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // set the winsize. var winsize linux.Winsize - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetWinsize(fd, &winsize) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 3835557fe..858cc24ce 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -26,9 +26,65 @@ go_template_instance( }, ) +go_template_instance( + name = "dentry_refs", + out = "dentry_refs.go", + package = "kernfs", + prefix = "Dentry", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Dentry", + }, +) + +go_template_instance( + name = "static_directory_refs", + out = "static_directory_refs.go", + package = "kernfs", + prefix = "StaticDirectory", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "StaticDirectory", + }, +) + +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "kernfs_test", + prefix = "dir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "dir", + }, +) + +go_template_instance( + name = "readonly_dir_refs", + out = "readonly_dir_refs.go", + package = "kernfs_test", + prefix = "readonlyDir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "readonlyDir", + }, +) + +go_template_instance( + name = "synthetic_directory_refs", + out = "synthetic_directory_refs.go", + package = "kernfs", + prefix = "syntheticDirectory", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "syntheticDirectory", + }, +) + go_library( name = "kernfs", srcs = [ + "dentry_refs.go", "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", @@ -36,7 +92,10 @@ go_library( "inode_impl_util.go", "kernfs.go", "slot_list.go", + "static_directory_refs.go", "symlink.go", + "synthetic_directory.go", + "synthetic_directory_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -59,11 +118,17 @@ go_library( go_test( name = "kernfs_test", size = "small", - srcs = ["kernfs_test.go"], + srcs = [ + "dir_refs.go", + "kernfs_test.go", + "readonly_dir_refs.go", + ], deps = [ ":kernfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/refs", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 
12adf727a..b929118b1 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -35,6 +35,7 @@ import (
 // +stateify savable
 type DynamicBytesFile struct {
 	InodeAttrs
+	InodeNoStatFS
 	InodeNoopRefCount
 	InodeNotDirectory
 	InodeNotSymlink
@@ -55,9 +56,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
 }

 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
+	if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -86,12 +87,12 @@ type DynamicBytesFD struct {
 }

 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
 	fd.LockFD.Init(locks)
-	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
-	fd.inode = d.Impl().(*Dentry).inode
+	fd.inode = d.inode
 	fd.SetDataSource(data)
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index fcee6200a..abf1905d6 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,7 +15,7 @@ package kernfs

 import (
-	"math"
+	"fmt"

 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -28,9 +28,29 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )

+// SeekEndConfig describes the SEEK_END behaviour for FDs.
+//
+// +stateify savable
+type SeekEndConfig int
+
+// Constants related to SEEK_END behaviour for FDs.
+const (
+	// Consider the end of the file to be after the final static entry. This is
+	// the default option.
+	SeekEndStaticEntries = iota
+	// Consider the end of the file to be at offset 0.
+	SeekEndZero
+)
+
+// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+//
+// +stateify savable
+type GenericDirectoryFDOptions struct {
+	SeekEnd SeekEndConfig
+}
+
 // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
-// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
-// compatible with dynamic directories.
+// inode that uses OrderedChildren to track child nodes.
 //
 // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
 // IterDirents callback. The IterDirents callback therefore cannot hash or
@@ -40,16 +60,21 @@ import (
 // Must be initialized with Init before first use.
 //
 // Lock ordering: mu => children.mu.
+//
+// +stateify savable
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
 	vfs.LockFD

+	// Immutable.
+	seekEnd SeekEndConfig
+
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren

 	// mu protects the fields below.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`

 	// off is the current directory offset. Protected by "mu".
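With GenericDirectoryFDOptions, each directory type picks its SEEK_END semantics: static directories count their entries plus "." and "..", while SeekEndZero pins SEEK_END at offset 0 so entries added later are never skipped (replacing the old math.MaxInt64 behavior flagged by gvisor.dev/issue/1193). A compact model of the arithmetic in the Seek switch below:

package main

import "fmt"

type seekEndConfig int

const (
	seekEndStaticEntries seekEndConfig = iota // EOF sits after the last static entry
	seekEndZero                               // EOF pinned at offset 0
)

// seekEnd models the SEEK_END computation GenericDirectoryFD adopts:
// static directories report "." and ".." plus their child count, while
// dynamic directories report zero so SEEK_END never hides new entries.
func seekEnd(cfg seekEndConfig, numChildren int, delta int64) int64 {
	switch cfg {
	case seekEndStaticEntries:
		return int64(numChildren) + 2 + delta // +2 for "." and ".."
	case seekEndZero:
		return delta
	default:
		panic(fmt.Sprintf("invalid seekEndConfig %v", cfg))
	}
}

func main() {
	fmt.Println(seekEnd(seekEndStaticEntries, 3, 0)) // 5
	fmt.Println(seekEnd(seekEndZero, 3, 0))          // 0
}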
off int64 @@ -57,12 +82,12 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} - if err := fd.Init(children, locks, opts); err != nil { + if err := fd.Init(children, locks, opts, fdOpts); err != nil { return nil, err } - if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil @@ -71,12 +96,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } fd.LockFD.Init(locks) + fd.seekEnd = fdOpts.SeekEnd fd.children = children return nil } @@ -175,13 +201,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // these. childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { - inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(ctx, fd.filesystem(), opts) + stat, err := it.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ - Name: it.Name, + Name: it.name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, @@ -209,9 +234,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: - // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up - // if they are added after SEEK_END. - offset = math.MaxInt64 + switch fd.seekEnd { + case SeekEndStaticEntries: + fd.children.mu.RLock() + offset += int64(len(fd.children.set)) + offset += 2 // '.' and '..' aren't tracked in children. + fd.children.mu.RUnlock() + case SeekEndZero: + // No-op: offset += 0. + default: + panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) + } default: return 0, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 3e5192edd..6426a55f6 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -32,11 +32,12 @@ import ( // // stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * !rp.Done(). // // Postcondition: Caller must call fs.processDeferredDecRefs*. 
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) { - d := vfsd.Impl().(*Dentry) +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } @@ -53,20 +54,20 @@ afterSymlink: // calls d_revalidate(), but walk_component() => handle_dots() does not. if name == "." { rp.Advance() - return vfsd, nil + return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() - return vfsd, nil + return d, nil } - if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil { return nil, err } rp.Advance() - return &d.parent.vfsd, nil + return d.parent, nil } if len(name) > linux.NAME_MAX { return nil, syserror.ENAMETOOLONG @@ -77,7 +78,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(ctx, &next.vfsd); err != nil { + if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -88,7 +89,7 @@ afterSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef(ctx) + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -100,15 +101,18 @@ afterSymlink: goto afterSymlink } rp.Advance() - return &next.vfsd, nil + return next, nil } // revalidateChildLocked must be called after a call to parent.vfsd.Child(name) // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be // nil) to verify that the returned child (or lack thereof) is correct. // -// Preconditions: Filesystem.mu must be locked for at least reading. -// parent.dirMu must be locked. parent.isDir(). name is not "." or "..". +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { @@ -116,26 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(ctx, &child.vfsd) - fs.deferDecRef(&child.vfsd) // Reference from Lookup. + if child.inode.Keep() { + // Drop the ref owned by kernfs. + fs.deferDecRef(child) + } + vfsObj.InvalidateDentry(ctx, child.VFSDentry()) child = nil } } if child == nil { - // Dentry isn't cached; it either doesn't exist or failed - // revalidation. Attempt to resolve it via Lookup. - // - // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return - // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes - // that all dentries in the filesystem are (kernfs.)Dentry and performs - // vfs.DentryImpl casts accordingly. - childVFSD, err := parent.inode.Lookup(ctx, name) + // Dentry isn't cached; it either doesn't exist or failed revalidation. + // Attempt to resolve it via Lookup. + childInode, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } - // Reference on childVFSD dropped by a corresponding Valid. 
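Reference drops in the walk path are now routed through fs.deferDecRef / fs.deferDecRefVD instead of immediate DecRef calls, because dropping the last reference can re-enter the filesystem and would deadlock under fs.mu. A toy version of the pattern follows; note the defer ordering, which the OpenAt comment further below spells out (processDeferredDecRefs is deferred before Unlock, so it runs after the lock is released). The real code synchronizes the pending list separately; this single-goroutine sketch skips that.

package main

import (
	"fmt"
	"sync"
)

type ref interface{ DecRef() }

// fsLike sketches the deferDecRef pattern: DecRef calls that might
// re-acquire fs.mu are queued while the lock is held and processed only
// after it is released.
type fsLike struct {
	mu      sync.RWMutex
	pending []ref
}

func (fs *fsLike) deferDecRef(r ref) { fs.pending = append(fs.pending, r) }

func (fs *fsLike) processDeferredDecRefs() {
	// Precondition in the real code: fs.mu is no longer held here.
	for _, r := range fs.pending {
		r.DecRef()
	}
	fs.pending = fs.pending[:0]
}

type noisyRef string

func (n noisyRef) DecRef() { fmt.Println("dropped", string(n)) }

func (fs *fsLike) walk() {
	fs.mu.Lock()
	// Defers run LIFO: Unlock (declared second) runs first, then
	// processDeferredDecRefs, matching the ordering in the diff.
	defer fs.processDeferredDecRefs()
	defer fs.mu.Unlock()
	fs.deferDecRef(noisyRef("stale child"))
}

func main() { (&fsLike{}).walk() }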
- child = childVFSD.Impl().(*Dentry) - parent.insertChildLocked(name, child) + var newChild Dentry + newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. + parent.insertChildLocked(name, &newChild) + child = &newChild + + // Drop the ref on newChild. This will cause the dentry to get pruned + // from the dentry tree by the end of current filesystem operation + // (before returning to the VFS layer) if another ref is not picked on + // this dentry. + if !childInode.Keep() { + fs.deferDecRef(&newChild) + } } return child, nil } @@ -148,20 +159,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Preconditions: Filesystem.mu must be locked for at least reading. // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Done() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if rp.MustBeDir() && !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -171,32 +181,34 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * !rp.Done(). // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Final() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // checkCreateLocked checks that a file named rp.Component() may be created in -// directory parentVFSD, then returns rp.Component(). +// directory parent, then returns rp.Component(). // -// Preconditions: Filesystem.mu must be locked for at least reading. parentInode -// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. -func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * isDir(parentInode) == true. 
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return "", err } pc := rp.Component() @@ -206,11 +218,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if len(pc) > linux.NAME_MAX { return "", syserror.ENAMETOOLONG } - // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. - if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { + if _, ok := parent.children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDead() { + if parent.VFSDentry().IsDead() { return "", syserror.ENOENT } return pc, nil @@ -219,8 +230,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. -func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parent := vfsd.Impl().(*Dentry).parent +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { + parent := d.parent if parent == nil { return syserror.EBUSY } @@ -249,11 +260,11 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } - return inode.CheckPermissions(ctx, creds, ats) + return d.inode.CheckPermissions(ctx, creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -261,20 +272,20 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { - d := vfsd.Impl().(*Dentry) if !d.isDir() { return nil, syserror.ENOTDIR } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } + vfsd := d.VFSDentry() vfsd.IncRef() // Ownership transferred to caller. return vfsd, nil } @@ -284,12 +295,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + d, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } - vfsd.IncRef() // Ownership transferred to caller. - return vfsd, nil + d.IncRef() // Ownership transferred to caller. + return d.VFSDentry(), nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. @@ -298,13 +309,16 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. 
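checkCreateLocked now takes the parent *Dentry and, per the new locking in the create paths below (LinkAt, MkdirAt, MknodAt), runs with parent.dirMu held, closing the old FIXME about racing on parent.children. A minimal sketch of why the existence check and the insert must share one critical section:

package main

import (
	"errors"
	"sync"
)

var errExists = errors.New("EEXIST")

// parentDir sketches the fixed locking: checking for an existing child
// and inserting the new one happen under the same dirMu critical
// section, so two concurrent creates can no longer both pass the
// existence check.
type parentDir struct {
	dirMu    sync.Mutex
	children map[string]struct{}
}

func (p *parentDir) create(name string, build func() struct{}) error {
	p.dirMu.Lock()
	defer p.dirMu.Unlock()
	if _, ok := p.children[name]; ok {
		return errExists
	}
	if p.children == nil {
		p.children = make(map[string]struct{})
	}
	p.children[name] = build()
	return nil
}

func main() {
	p := &parentDir{}
	_ = p.create("a", func() struct{} { return struct{}{} })
	_ = p.create("a", func() struct{} { return struct{}{} }) // EEXIST, race-free
}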
return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -321,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) + childI, err := parent.inode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -335,13 +351,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -349,11 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewDir(ctx, pc, opts) + childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { - return err + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + childI = newSyntheticDirectory(rp.Credentials(), opts.Mode) } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -363,13 +387,16 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -377,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - newVFSD, err := parentInode.NewNode(ctx, pc, opts) + newI, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) + var newD Dentry + newD.Init(fs, newI) + parent.insertChildLocked(pc, &newD) return nil } @@ -397,28 +426,28 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. 
if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + defer fs.processDeferredDecRefs(ctx) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) return nil, err } - inode.IncRef() - defer inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) - return inode.Open(ctx, rp, vfsd, opts) + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } // May create new file. mustCreate := opts.Flags&linux.O_EXCL != 0 - vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + d := rp.Start().Impl().(*Dentry) fs.mu.Lock() unlocked := false unlock := func() { @@ -427,6 +456,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf unlocked = true } } + // Process all to-be-decref'd dentries at the end at once. + // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked + // when this is executed. + defer fs.processDeferredDecRefs(ctx) defer unlock() if rp.Done() { if rp.MustBeDir() { @@ -435,22 +468,24 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - inode.IncRef() - defer inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() unlock() - return inode.Open(ctx, rp, vfsd, opts) + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } afterTrailingSymlink: - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -465,10 +500,10 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */) + child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -476,16 +511,23 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. 
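OpenAt now follows an IncRef / unlock / Open / DecRef dance everywhere it calls inode.Open: Open may block, so fs.mu cannot be held across it, and the temporary reference keeps the dentry from being destroyed in the window. A toy reduction of the pattern (the ref count is guarded by the same lock purely for brevity):

package main

import (
	"fmt"
	"sync"
)

type dentryLike struct {
	refs int // protected by fs.mu in this toy version
}

type fsLike struct{ mu sync.Mutex }

// openAt mirrors the shape of the new OpenAt paths: pin the dentry,
// release the filesystem lock, run the possibly-blocking open, then
// drop the pin once the open holds its own reference.
func (fs *fsLike) openAt(d *dentryLike, open func() (string, error)) (string, error) {
	fs.mu.Lock()
	d.refs++ // IncRef: keep d alive while fs.mu is unlocked.
	fs.mu.Unlock()

	fd, err := open() // may block; fs.mu is not held

	fs.mu.Lock()
	d.refs-- // DecRef once the open has its own reference.
	fs.mu.Unlock()
	return fd, err
}

func main() {
	fs := &fsLike{}
	fd, err := fs.openAt(&dentryLike{}, func() (string, error) { return "fd", nil })
	fmt.Println(fd, err)
}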
- childVFSD, err = parentInode.NewFile(ctx, pc, opts) + childI, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } - child := childVFSD.Impl().(*Dentry) - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - child.inode.IncRef() - defer child.inode.DecRef(ctx) + var child Dentry + child.Init(fs, childI) + // FIXME(gvisor.dev/issue/1193): Race between checking existence with + // fs.stepExistingLocked and parent.insertChild. If possible, we should hold + // dirMu from one to the other. + parent.insertChild(pc, &child) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() unlock() - return child.inode.Open(ctx, rp, childVFSD, opts) + fd, err := child.inode.Open(ctx, rp, &child, opts) + child.DecRef(ctx) + return fd, err } if err != nil { return nil, err @@ -494,7 +536,6 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - child := childVFSD.Impl().(*Dentry) if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) if err != nil { @@ -502,7 +543,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef(ctx) + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -518,25 +559,28 @@ afterTrailingSymlink: if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - child.inode.IncRef() - defer child.inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() unlock() - return child.inode.Open(ctx, rp, &child.vfsd, opts) + fd, err := child.inode.Open(ctx, rp, child, opts) + child.DecRef(ctx) + return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() - d, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } - if !d.Impl().(*Dentry).isSymlink() { + if !d.isSymlink() { return "", syserror.EINVAL } - return inode.Readlink(ctx) + return d.inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. @@ -548,16 +592,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked(ctx) + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. - dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + dstDir, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -575,16 +618,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } - srcVFSD := &src.vfsd // Can we remove the src dentry? - if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { + if err := checkDeleteLocked(ctx, rp, src); err != nil { return err } // Can we create the dst dentry? 
var dst *Dentry - pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) + pc, err := checkCreateLocked(ctx, rp, dstDir) switch err { case nil: // Ok, continue with rename as replacement. @@ -595,14 +637,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } dst = dstDir.children[pc] if dst == nil { - panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir)) } default: return err } var dstVFSD *vfs.Dentry if dst != nil { - dstVFSD = &dst.vfsd + dstVFSD = dst.VFSDentry() } mntns := vfs.MountNamespaceFromContext(ctx) @@ -618,35 +660,44 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer dstDir.dirMu.Unlock() } + srcVFSD := src.VFSDentry() if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) + err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { - fs.deferDecRef(srcDirVFSD) - dstDir.IncRef() + fs.deferDecRef(srcDir) // child (src) drops ref on old parent. + dstDir.IncRef() // child (src) takes a ref on the new parent. } src.parent = dstDir src.name = pc if dstDir.children == nil { dstDir.children = make(map[string]*Dentry) } + replaced := dstDir.children[pc] dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced) + var replaceVFSD *vfs.Dentry + if replaced != nil { + // deferDecRef so that fs.mu and dstDir.mu are unlocked by then. + fs.deferDecRef(replaced) + replaceVFSD = replaced.VFSDentry() + } + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -654,14 +705,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if !d.isDir() { return syserror.ENOTDIR } - if inode.HasChildren() { + if d.inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() @@ -671,13 +721,18 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + + if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -685,41 +740,40 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. 
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) + return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statx{}, err } - return inode.Stat(ctx, fs.VFSFilesystem(), opts) + return d.inode.Stat(ctx, fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1193): actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + return d.inode.StatFS(ctx, fs.VFSFilesystem()) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -728,13 +782,16 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -742,20 +799,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewSymlink(ctx, pc, target) + childI, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - vfsd, _, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -763,10 +823,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if d.isDir() { return syserror.EISDIR } @@ -776,38 +835,42 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } @@ -815,12 +878,12 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. 
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } @@ -828,12 +891,12 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -841,12 +904,12 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -860,3 +923,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe defer fs.mu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } + +func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // The following is equivalent to vd.DecRef(ctx). This is needed + // because if d belongs to this filesystem, we can not DecRef it right + // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we + // defer the DecRef to when locks are dropped. + vd.Mount().DecRef(ctx) + fs.deferDecRef(d) + } else { + vd.DecRef(ctx) + } +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index fe8a1e710..122b10591 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -32,7 +31,10 @@ import ( // count for inodes, performing no extra actions when references are obtained or // released. This is suitable for simple file inodes that don't reference any // resources. +// +// +stateify savable type InodeNoopRefCount struct { + InodeTemporary } // IncRef implements Inode.IncRef. @@ -51,30 +53,32 @@ func (InodeNoopRefCount) TryIncRef() bool { // InodeDirectoryNoNewChildren partially implements the Inode interface. // InodeDirectoryNoNewChildren represents a directory inode which does not // support creation of new children. 
+// +// +stateify savable type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { return nil, syserror.EPERM } @@ -82,7 +86,10 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not // represent directories can embed this to provide no-op implementations for // directory-related functions. +// +// +stateify savable type InodeNotDirectory struct { + InodeAlwaysValid } // HasChildren implements Inode.HasChildren. @@ -91,47 +98,47 @@ func (InodeNotDirectory) HasChildren() bool { } // NewFile implements Inode.NewFile. -func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLink. -func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, Inode) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. 
-func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, Inode) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) { panic("Lookup called on non-directory inode") } @@ -140,40 +147,15 @@ func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDiren panic("IterDirents called on non-directory inode") } -// Valid implements Inode.Valid. -func (InodeNotDirectory) Valid(context.Context) bool { - return true -} - -// InodeNoDynamicLookup partially implements the Inode interface, specifically -// the inodeDynamicLookup sub interface. Directory inodes that do not support -// dymanic entries (i.e. entries that are not "hashed" into the -// vfs.Dentry.children) can embed this to provide no-op implementations for -// functions related to dynamic entries. -type InodeNoDynamicLookup struct{} - -// Lookup implements Inode.Lookup. -func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - return nil, syserror.ENOENT -} - -// IterDirents implements Inode.IterDirents. -func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { - return offset, nil -} - -// Valid implements Inode.Valid. -func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { - return true -} - // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. +// +// +stateify savable type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", syserror.EINVAL } @@ -187,6 +169,8 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // inode attributes. // // Must be initialized by Init prior to first use. +// +// +stateify savable type InodeAttrs struct { devMajor uint32 devMinor uint32 @@ -257,12 +241,29 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return a.SetInodeStat(ctx, fs, creds, opts) +} + +// SetInodeStat sets the corresponding attributes from opts to InodeAttrs. +// This function can be used by other kernfs-based filesystem implementations to +// set the unexported attributes in InodeAttrs. +func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } - if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. 
Setting the size is often + // allowed by kernfs files but does not do anything. If some other behavior is + // needed, the embedder should consider extending SetStat. + // + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 { return syserror.EPERM } + if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { + return syserror.EISDIR + } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -285,13 +286,6 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut atomic.StoreUint32(&a.gid, stat.GID) } - // Note that not all fields are modifiable. For example, the file type and - // inode numbers are immutable after node creation. - - // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. - // Also, STATX_SIZE will need some special handling, because read-only static - // files should return EIO for truncate operations. - return nil } @@ -321,13 +315,17 @@ func (a *InodeAttrs) DecLinks() { } +// +stateify savable type slot struct { - Name string - Dentry *vfs.Dentry + name string + inode Inode + static bool slotEntry } // OrderedChildrenOptions contains initialization options for OrderedChildren. +// +// +stateify savable type OrderedChildrenOptions struct { // Writable indicates whether vfs.FilesystemImpl methods implemented by // OrderedChildren may modify the tracked children. This applies to @@ -337,20 +335,28 @@ type OrderedChildrenOptions struct { } // OrderedChildren partially implements the Inode interface. OrderedChildren can -// be embedded in directory inodes to keep track of the children in the +// be embedded in directory inodes to keep track of children in the // directory, and can then be used to implement a generic directory FD -- see -// GenericDirectoryFD. OrderedChildren is not compatible with dynamic -// directories. +// GenericDirectoryFD. +// +// OrderedChildren can represent a node in an Inode tree. The children inodes +// might be directories themselves using OrderedChildren; hence extending the +// tree. The parent inode (OrderedChildren user) holds a ref on all its static +// children. This lets the static inodes outlive their associated dentry. +// While the dentry might have to be regenerated via a Lookup() call, we can +// keep reusing the same static inode. These static children inodes are finally +// DecRef'd when this directory inode is being destroyed. This makes +// OrderedChildren suitable for static directory entries as well. // // Must be initialized with Init before first use. +// +// +stateify savable type OrderedChildren struct { - refs.AtomicRefCount - // Can children be modified by user syscalls? If set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` order slotList set map[string]*slot } @@ -361,36 +367,66 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.set = make(map[string]*slot) } -// DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef(ctx context.Context) { - o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { - o.mu.Lock() - defer o.mu.Unlock() - o.order.Reset() - o.set = nil - }) +// Destroy clears the children stored in o. It should be called by structs +// embedding OrderedChildren upon destruction, i.e. 
when their reference count +// reaches zero. +func (o *OrderedChildren) Destroy(ctx context.Context) { + o.mu.Lock() + defer o.mu.Unlock() + // Drop the ref that o owns on the static inodes it holds. + for _, s := range o.set { + if s.static { + s.inode.DecRef(ctx) + } + } + o.order.Reset() + o.set = nil } -// Populate inserts children into this OrderedChildren, and d's dentry -// cache. Populate returns the number of directories inserted, which the caller +// Populate inserts static children into this OrderedChildren. +// Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // -// Precondition: d must represent a directory inode. children must not contain -// any conflicting entries already in o. -func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { +// Precondition: +// * d must represent a directory inode. +// * children must not contain any conflicting entries already in o. +// * Caller must hold a reference on all inodes passed. +// +// Postcondition: Caller's references on inodes are transferred to o. +func (o *OrderedChildren) Populate(children map[string]Inode) uint32 { var links uint32 for name, child := range children { - if child.isDir() { + if child.Mode().IsDir() { links++ } - if err := o.Insert(name, child.VFSDentry()); err != nil { - panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + if err := o.insert(name, child, true); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child)) } - d.InsertChild(name, child) } return links } +// Lookup implements Inode.Lookup. +func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) { + o.mu.RLock() + defer o.mu.RUnlock() + + s, ok := o.set[name] + if !ok { + return nil, syserror.ENOENT + } + + s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. + return s.inode, nil +} + +// IterDirents implements Inode.IterDirents. +func (o *OrderedChildren) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + // All entries from OrderedChildren have already been handled in + // GenericDirectoryFD.IterDirents. + return offset, nil +} + // HasChildren implements Inode.HasChildren. func (o *OrderedChildren) HasChildren() bool { o.mu.RLock() @@ -398,17 +434,27 @@ func (o *OrderedChildren) HasChildren() bool { return len(o.set) > 0 } -// Insert inserts child into o. This ignores the writability of o, as this is -// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. -func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { +// Insert inserts a dynamic child into o. This ignores the writability of o, as +// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child Inode) error { + return o.insert(name, child, false) +} + +// insert inserts child into o. +// +// Precondition: Caller must be holding a ref on child if static is true. +// +// Postcondition: Caller's ref on child is transferred to o if static is true. 
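+//
+// As a hedged illustration only (dir, creds, newChildInode and tmpInode are
+// hypothetical embedder names, not part of this change), the two paths differ
+// in ref ownership:
+//
+//	children := map[string]Inode{"data": newChildInode(creds)}
+//	dir.OrderedChildren.Init(OrderedChildrenOptions{})
+//	// Static: Populate takes over the caller's refs on the child inodes.
+//	dir.IncLinks(dir.OrderedChildren.Populate(children))
+//	// Dynamic: Insert tracks the inode without taking a ref.
+//	if err := dir.OrderedChildren.Insert("tmp", tmpInode); err != nil {
+//		return err
+//	}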
+func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return syserror.EEXIST } s := &slot{ - Name: name, - Dentry: child, + name: name, + inode: child, + static: static, } o.order.PushBack(s) o.set[name] = s @@ -418,44 +464,49 @@ func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) removeLocked(name string) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode)) + } delete(o.set, name) o.order.Remove(s) } } // Precondition: caller must hold o.mu for writing. -func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { +func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("replacing a static inode: %v", s.inode)) + } + // Existing slot with given name, simply replace the inode. - var old *vfs.Dentry - old, s.Dentry = s.Dentry, new - return old + s.inode = newI + return } // No existing slot with given name, create and hash new slot. s := &slot{ - Name: name, - Dentry: new, + name: name, + inode: newI, + static: false, } o.order.PushBack(s) o.set[name] = s - return nil } // Precondition: caller must hold o.mu for reading or writing. -func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { +func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { return syserror.ENOENT } - if s.Dentry != child { - panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + if s.inode != child { + panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) } return nil } // Unlink implements Inode.Unlink. -func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { return syserror.EPERM } @@ -470,13 +521,14 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De return nil } -// Rmdir implements Inode.Rmdir. -func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { +// RmDir implements Inode.RmDir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) } +// +stateify savable type renameAcrossDifferentImplementationsError struct{} func (renameAcrossDifferentImplementationsError) Error() string { @@ -492,13 +544,13 @@ func (renameAcrossDifferentImplementationsError) Error() string { // that will support Rename. // // Postcondition: any replaced inode is no longer tracked by the destination. 
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { - dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { + dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { - return nil, renameAcrossDifferentImplementationsError{} + return renameAcrossDifferentImplementationsError{} } if !o.writable || !dst.writable { - return nil, syserror.EPERM + return syserror.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename // refer to the same src and dst directories in reverse. We avoid any @@ -511,12 +563,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c defer dst.mu.Unlock() } if err := o.checkExistingLocked(oldname, child); err != nil { - return nil, err + return err } // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. - replaced := dst.replaceChildLocked(newname, child) - return replaced, nil + dst.replaceChildLocked(ctx, newname, child) + return nil } // nthLocked returns an iterator to the nth child tracked by this object. The @@ -535,12 +587,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot { } // InodeSymlink partially implements Inode interface for symlinks. +// +// +stateify savable type InodeSymlink struct { InodeNotDirectory } // Open implements Inode.Open. -func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -549,43 +603,46 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { - InodeNotSymlink - InodeDirectoryNoNewChildren + InodeAlwaysValid InodeAttrs - InodeNoDynamicLookup + InodeDirectoryNoNewChildren + InodeNoStatFS + InodeNotSymlink + InodeTemporary OrderedChildren + StaticDirectoryRefs - locks vfs.FileLocks + locks vfs.FileLocks + fdOpts GenericDirectoryFDOptions } var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode { inode := &StaticDirectory{} - inode.Init(creds, devMajor, devMinor, ino, perm) - - dentry := &Dentry{} - dentry.Init(inode) + inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts) + inode.EnableLeakCheck() inode.OrderedChildren.Init(OrderedChildrenOptions{}) - links := inode.OrderedChildren.Populate(dentry, children) + links := inode.OrderedChildren.Populate(children) inode.IncLinks(links) - return dentry + return inode } // Init initializes StaticDirectory. 
-func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } + s.fdOpts = fdOpts s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } -// Open implements kernfs.Inode. -func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts) +// Open implements Inode.Open. +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } @@ -597,10 +654,38 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti return syserror.EPERM } -// AlwaysValid partially implements kernfs.inodeDynamicLookup. -type AlwaysValid struct{} +// DecRef implements Inode.DecRef. +func (s *StaticDirectory) DecRef(ctx context.Context) { + s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) }) +} + +// InodeAlwaysValid partially implements Inode. +// +// +stateify savable +type InodeAlwaysValid struct{} -// Valid implements kernfs.inodeDynamicLookup. -func (*AlwaysValid) Valid(context.Context) bool { +// Valid implements Inode.Valid. +func (*InodeAlwaysValid) Valid(context.Context) bool { return true } + +// InodeTemporary partially implements Inode. +// +// +stateify savable +type InodeTemporary struct{} + +// Keep implements Inode.Keep. +func (*InodeTemporary) Keep() bool { + return false +} + +// InodeNoStatFS partially implements the Inode interface, where the client +// filesystem doesn't support statfs(2). +// +// +stateify savable +type InodeNoStatFS struct{} + +// StatFS implements Inode.StatFS. +func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 51dbc050c..606081e68 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -29,12 +29,16 @@ // // Reference Model: // -// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// Kernfs dentries represent named pointers to inodes. Kernfs is solely +// responsible for maintaining and modifying its dentry tree; inode +// implementations cannot access the tree. Dentries and inodes have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. A dentry also holds a -// reference on the inode it points to. Multiple dentries can point to the same -// inode (for example, in the case of hardlinks). File descriptors hold a -// reference to the dentry they're opened on. +// reference on the inode it points to (although that might not be the only +// reference on the inode). Due to this, inodes can outlive the dentries that +// point to them. Multiple dentries can point to the same inode (for example, +// in the case of hardlinks). File descriptors hold a reference to the dentry +// they're opened on. 
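+//
+// As a hedged sketch of the model above (fs, inode and ctx are assumed to be
+// in scope; illustrative only, not code from this change):
+//
+//	var d Dentry
+//	d.Init(fs, inode) // d takes over the caller's ref on inode.
+//	d.IncRef()        // e.g. a file description pins d...
+//	d.DecRef(ctx)     // ...and unpins it on close.
+//	d.DecRef(ctx)     // last ref: d drops its inode ref; other inode refs may remain.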
// // Dentries are guaranteed to exist while holding Filesystem.mu for // reading. Dropping dentries requires holding Filesystem.mu for writing. To @@ -47,8 +51,8 @@ // kernfs.Dentry.dirMu // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu -// kernfs.Filesystem.droppedDentriesMu // (inode implementation locks, if any) +// kernfs.Filesystem.droppedDentriesMu package kernfs import ( @@ -57,7 +61,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -66,15 +69,17 @@ import ( // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. +// +// +stateify savable type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex + droppedDentriesMu sync.Mutex `state:"nosave"` // droppedDentries is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for // writing. Protected by droppedDentriesMu. - droppedDentries []*vfs.Dentry + droppedDentries []*Dentry // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -93,11 +98,11 @@ type Filesystem struct { // example: // // fs.mu.RLock() - // fs.mu.processDeferredDecRefs() + // defer fs.processDeferredDecRefs() // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // nextInoMinusOne is used to allocate inode numbers on this // filesystem. Must be accessed by atomic operations. @@ -106,9 +111,8 @@ type Filesystem struct { // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. -// -// Precondition: d must not already be pending destruction. -func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { +// This may be called while Filesystem.mu or Dentry.dirMu is locked. +func (fs *Filesystem) deferDecRef(d *Dentry) { fs.droppedDentriesMu.Lock() fs.droppedDentries = append(fs.droppedDentries, d) fs.droppedDentriesMu.Unlock() } @@ -116,17 +120,14 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. +// +// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { - fs.mu.Lock() - fs.processDeferredDecRefsLocked(ctx) - fs.mu.Unlock() -} - -// Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef(ctx) + // Defer the DecRef call so that we are not holding droppedDentriesMu + // when DecRef is called. + defer d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() } @@ -155,15 +156,19 @@ const ( // // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a // named reference to an inode. A dentry generally lives as long as it's part of -// a mounted filesystem tree. 
Kernfs drops dentries once all references to them +// are dropped. Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. +// +// +stateify savable type Dentry struct { vfsd vfs.Dentry + DentryRefs - refs.AtomicRefCount + // fs is the owning filesystem. fs is immutable. + fs *Filesystem // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -173,7 +178,11 @@ type Dentry struct { name string // dirMu protects children and the names of child Dentries. - dirMu sync.Mutex + // + // Note that holding fs.mu for writing is not sufficient; + // revalidateChildLocked(), which is a very hot path, may modify children with + // fs.mu acquired for reading only. + dirMu sync.Mutex `state:"nosave"` children map[string]*Dentry inode Inode @@ -184,8 +193,9 @@ type Dentry struct { // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. -func (d *Dentry) Init(inode Inode) { +func (d *Dentry) Init(fs *Filesystem, inode Inode) { d.vfsd.Init(d) + d.fs = fs d.inode = inode ftype := inode.Mode().FileType() if ftype == linux.ModeDirectory { @@ -194,6 +204,7 @@ func (d *Dentry) Init(inode Inode) { if ftype == linux.ModeSymlink { d.flags |= dflagsIsSymlink } + d.EnableLeakCheck() } // VFSDentry returns the generic vfs dentry for this kernfs dentry. @@ -213,15 +224,27 @@ func (d *Dentry) isSymlink() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { - d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) -} - -// Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy(ctx context.Context) { - d.inode.DecRef(ctx) // IncRef from Init. - d.inode = nil - if d.parent != nil { - d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. + decRefParent := false + d.fs.mu.Lock() + d.DentryRefs.DecRef(func() { + d.inode.DecRef(ctx) // IncRef from Init. + d.inode = nil + if d.parent != nil { + // We will DecRef d.parent once all locks are dropped. + decRefParent = true + d.parent.dirMu.Lock() + // Remove d from parent.children. It might already have been + // removed due to invalidation. + if _, ok := d.parent.children[d.name]; ok { + delete(d.parent.children, d.name) + d.fs.VFSFilesystem().VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) + } + d.parent.dirMu.Unlock() + } + }) + d.fs.mu.Unlock() + if decRefParent { + d.parent.DecRef(ctx) // IncRef from Dentry.insertChild. } } @@ -240,25 +263,26 @@ func (d *Dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.Dentry.OnZeroWatches. func (d *Dentry) OnZeroWatches(context.Context) {} -// InsertChild inserts child into the vfs dentry cache with the given name under -// this dentry. This does not update the directory inode, so calling this on -// its own isn't sufficient to insert a child into a directory. InsertChild -// updates the link count on d if required. +// insertChild inserts child into the vfs dentry cache with the given name under +// this dentry. This does not update the directory inode, so calling this on its +// own isn't sufficient to insert a child into a directory. // // Precondition: d must represent a directory inode. 
-func (d *Dentry) InsertChild(name string, child *Dentry) { +func (d *Dentry) insertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() } -// insertChildLocked is equivalent to InsertChild, with additional +// insertChildLocked is equivalent to insertChild, with additional // preconditions. // -// Precondition: d.dirMu must be locked. +// Preconditions: +// * d must represent a directory inode. +// * d.dirMu must be locked. func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { - panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d)) } d.IncRef() // DecRef in child's Dentry.destroy. child.parent = d @@ -289,7 +313,6 @@ func (d *Dentry) Inode() Inode { // // - Checking that dentries passed to methods are of the appropriate file type. // - Checking permissions. -// - Updating link and reference counts. // // Specific responsibilities of implementations are documented below. type Inode interface { @@ -299,7 +322,8 @@ type Inode interface { inodeRefs // Methods related to node metadata. A generic implementation is provided by - // InodeAttrs. + // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for + // managing link counts. inodeMetadata // Method for inodes that represent symlink. InodeNotSymlink provides a @@ -310,18 +334,26 @@ type Inode interface { // a blanket implementation for all non-directory inodes. inodeDirectory - // Method for inodes that represent dynamic directories and their - // children. InodeNoDynamicLookup provides a blanket implementation for all - // non-dynamic-directory inodes. - inodeDynamicLookup - // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the - // inode for its lifetime. + // dentry for its lifetime. // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. - Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + + // StatFS returns filesystem statistics for the client filesystem. This + // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem + // doesn't support statfs(2), this should return ENOSYS. + StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) + + // Keep indicates whether the dentry created after Inode.Lookup should be + // kept in the kernfs dentry tree. + Keep() bool + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool } type inodeRefs interface { @@ -354,8 +386,8 @@ type inodeMetadata interface { // Precondition: All methods in this interface may only be called on directory // inodes. type inodeDirectory interface { - // The New{File,Dir,Node,Symlink} methods below should return a new inode - // hashed into this inode. + // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode + // that will be hashed into the dentry tree. 
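+	//
+	// As a hedged sketch of the new constructor shape (mirroring the updated
+	// dir.NewDir test code later in this diff; the dir type and its helpers
+	// are test-local, not part of the kernfs API):
+	//
+	//	func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) {
+	//		child := d.fs.newDir(auth.CredentialsFromContext(ctx), opts.Mode, nil)
+	//		if err := d.OrderedChildren.Insert(name, child); err != nil {
+	//			child.DecRef(ctx)
+	//			return nil, err
+	//		}
+	//		d.IncLinks(1)
+	//		return child, nil
+	//	}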
// // These inode constructors are inode-level operations rather than // filesystem-level operations to allow client filesystems to mix different @@ -366,63 +398,57 @@ type inodeDirectory interface { HasChildren() bool // NewFile creates a new regular file inode. - NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) // NewDir creates a new directory inode. - NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. - NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + NewLink(ctx context.Context, name string, target Inode) (Inode, error) // NewSymlink creates a new symbolic link inode. - NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + NewSymlink(ctx context.Context, name, target string) (Inode, error) // NewNode creates a new filesystem node for a mknod syscall. - NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) // Unlink removes a child dentry from this directory inode. - Unlink(ctx context.Context, name string, child *vfs.Dentry) error + Unlink(ctx context.Context, name string, child Inode) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, checking for an empty directory. - RmDir(ctx context.Context, name string, child *vfs.Dentry) error + RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being // renamed. child should point to the resolved child in the source - // directory. If Rename replaces a dentry in the destination directory, it - // should return the replaced dentry or nil otherwise. + // directory. // // Precondition: Caller must serialize concurrent calls to Rename. - Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) -} + Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error -type inodeDynamicLookup interface { - // Lookup should return an appropriate dentry if name should resolve to a - // child of this dynamic directory inode. This gives the directory an - // opportunity on every lookup to resolve additional entries that aren't - // hashed into the directory. This is only called when the inode is a - // directory. If the inode is not a directory, or if the directory only - // contains a static set of children, the implementer can unconditionally - // return an appropriate error (ENOTDIR and ENOENT respectively). + // Lookup should return an appropriate inode if name should resolve to a + // child of this directory inode. This gives the directory an opportunity + // on every lookup to resolve additional entries. This is only called when + // the inode is a directory. // - // The child returned by Lookup will be hashed into the VFS dentry tree. Its - // lifetime can be controlled by the filesystem implementation with an - // appropriate implementation of Valid. 
+ // The child returned by Lookup will be hashed into the VFS dentry tree, + // at least for the duration of the current FS operation. // - // Lookup returns the child with an extra reference and the caller owns this - // reference. - Lookup(ctx context.Context, name string) (*vfs.Dentry, error) - - // Valid should return true if this inode is still valid, or needs to - // be resolved again by a call to Lookup. - Valid(ctx context.Context) bool + // Lookup must return the child with an extra reference whose ownership is + // transferred to the dentry that is created to point to that inode. If + // Inode.Keep returns false, that new dentry will be dropped at the end of + // the current filesystem operation (before returning back to the VFS + // layer) if no other ref is taken on that dentry. If Inode.Keep returns + // true, then the dentry will be cached into the dentry tree until it is + // Unlink'd or RmDir'd. + Lookup(ctx context.Context, name string) (Inode, error) // IterDirents is used to iterate over dynamically created entries. It invokes - // cb on each entry in the directory represented by the FileDescription. + // cb on each entry in the directory represented by the Inode. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller (e.g. "." and ".."). 'relOffset' is the offset // inside the entries returned by this IterDirents invocation. In other words, @@ -434,7 +460,7 @@ type inodeDynamicLookup interface { type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. - Readlink(ctx context.Context) (string, error) + Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index c5d5afedf..82fa19c03 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file." // RootDentryFn is a generator function for creating the root dentry of a test // filesystem. See newTestSystem. -type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry +type RootDentryFn func(*auth.Credentials, *filesystem) kernfs.Inode // newTestSystem sets up a minimal environment for running a test, including an // instance of a test filesystem. 
Tests can control the contents of the @@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) } @@ -72,14 +72,11 @@ type file struct { content string } -func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { +func (fs *filesystem) newFile(creds *auth.Credentials, content string) kernfs.Inode { f := &file{} f.content = content f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) - - d := &kernfs.Dentry{} - d.Init(f) - return d + return f } func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { @@ -96,100 +93,110 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S } type readonlyDir struct { + readonlyDirRefs attrs - kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup + kernfs.InodeAlwaysValid kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoStatFS + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks - - dentry kernfs.Dentry } -func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &readonlyDir{} dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - dir.dentry.Init(dir) - - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.EnableLeakCheck() + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } -func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) +func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } +func (d *readonlyDir) DecRef(ctx context.Context) { + d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) }) +} + type dir struct { + dirRefs attrs + kernfs.InodeAlwaysValid kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup + kernfs.InodeNoStatFS + kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks - fs *filesystem - dentry kernfs.Dentry + fs *filesystem } -func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &dir{} dir.fs = fs dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) - dir.dentry.Init(dir) + 
dir.EnableLeakCheck() - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } -func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { +func (d *dir) DecRef(ctx context.Context) { + d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +} + +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) dir := d.fs.newDir(creds, opts.Mode, nil) - dirVFSD := dir.VFSDentry() - if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { + if err := d.OrderedChildren.Insert(name, dir); err != nil { dir.DecRef(ctx) return nil, err } d.IncLinks(1) - return dirVFSD, nil + return dir, nil } -func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) f := d.fs.newFile(creds, "") - fVFSD := f.VFSDentry() - if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { + if err := d.OrderedChildren.Insert(name, f); err != nil { f.DecRef(ctx) return nil, err } - return fVFSD, nil + return f, nil } -func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { +func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) { return nil, syserror.EPERM } @@ -197,18 +204,22 @@ func (fsType) Name() string { return "kernfs" } +func (fsType) Release(ctx context.Context) {} + func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} fs.VFSFilesystem().Init(vfsObj, &fst, fs) root := fst.rootFn(creds, fs) - return fs.VFSFilesystem(), root.VFSDentry(), nil + var d kernfs.Dentry + d.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), d.VFSDentry(), nil } // -------------------- Remainder of the file are test cases -------------------- func TestBasic(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "file1": fs.newFile(creds, staticFileContent), }) }) @@ 
-217,8 +228,8 @@ func TestBasic(t *testing.T) { } func TestMkdirGetDentry(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "dir1": fs.newDir(creds, 0755, nil), }) }) @@ -232,8 +243,8 @@ func TestMkdirGetDentry(t *testing.T) { } func TestReadStaticFile(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "file1": fs.newFile(creds, staticFileContent), }) }) @@ -258,8 +269,8 @@ func TestReadStaticFile(t *testing.T) { } func TestCreateNewFileInStaticDir(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "dir1": fs.newDir(creds, 0755, nil), }) }) @@ -285,7 +296,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } func TestDirFDReadWrite(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { return fs.newReadonlyDir(creds, 0755, nil) }) defer sys.Destroy() @@ -309,11 +320,11 @@ func TestDirFDReadWrite(t *testing.T) { } func TestDirFDIterDirents(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ // Fill root with nodes backed by various inode implementations. "dir1": fs.newReadonlyDir(creds, 0755, nil), - "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir2": fs.newDir(creds, 0755, map[string]kernfs.Inode{ "dir3": fs.newDir(creds, 0755, nil), }), "file1": fs.newFile(creds, staticFileContent), diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 2ab3f53fd..934cc6c9e 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -24,10 +24,13 @@ import ( // StaticSymlink provides an Inode implementation for symlinks that point to // an immutable target. +// +// +stateify savable type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeSymlink + InodeNoStatFS target string } @@ -35,13 +38,10 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode { inode := &StaticSymlink{} inode.Init(creds, devMajor, devMinor, ino, target) - - d := &Dentry{} - d.Init(inode) - return d + return inode } // Init initializes the instance.
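The recurring pattern in the kernfs hunks above: inode constructors such as NewStaticSymlink, newFile, and newDir now return a kernfs.Inode rather than a *kernfs.Dentry, and a Dentry is created only where the inode is actually hashed into the dentry tree, e.g. at mount time in GetFilesystem. A minimal sketch of that mount-time wiring, reusing the test filesystem type from kernfs_test.go above; the mountRoot helper itself is hypothetical and not part of this diff:

	// mountRoot wraps a root inode in a freshly initialized kernfs.Dentry.
	// Dentry.Init takes ownership of the extra reference held on the inode,
	// per the Lookup ownership contract described in the kernfs.go hunk above.
	func mountRoot(fs *filesystem, root kernfs.Inode) *vfs.Dentry {
		var d kernfs.Dentry
		d.Init(&fs.Filesystem, root)
		return d.VFSDentry()
	}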
@@ -50,8 +50,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } -// Readlink implements Inode. -func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { +// Readlink implements Inode.Readlink. +func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go new file mode 100644 index 000000000..d0ed17b18 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -0,0 +1,112 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// syntheticDirectory implements kernfs.Inode for a directory created by +// MkdirAt(ForSyntheticMountpoint=true). +// +// +stateify savable +type syntheticDirectory struct { + InodeAlwaysValid + InodeAttrs + InodeNoStatFS + InodeNotSymlink + OrderedChildren + syntheticDirectoryRefs + + locks vfs.FileLocks +} + +var _ Inode = (*syntheticDirectory)(nil) + +func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) Inode { + inode := &syntheticDirectory{} + inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm) + return inode +} + +func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm)) + } + dir.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.S_IFDIR|perm) + dir.OrderedChildren.Init(OrderedChildrenOptions{ + Writable: true, + }) +} + +// Open implements Inode.Open. +func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{}) + if err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// NewFile implements Inode.NewFile. +func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { + if !opts.ForSyntheticMountpoint { + return nil, syserror.EPERM + } + subdirI := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) + if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { + subdirI.DecRef(ctx) + return nil, err + } + return subdirI, nil +} + +// NewLink implements Inode.NewLink. 
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { + return nil, syserror.EPERM +} + +// DecRef implements Inode.DecRef. +func (dir *syntheticDirectory) DecRef(ctx context.Context) { + dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) }) +} + +// Keep implements Inode.Keep. This is redundant because inodes will never be +// created via Lookup and inodes are always valid. It makes sense to return true +// because these inodes are not temporary and should only be removed on RmDir. +func (dir *syntheticDirectory) Keep() bool { + return true +} diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index 8cf5b35d3..1e11b0428 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -21,14 +21,16 @@ go_library( "directory.go", "filesystem.go", "fstree.go", - "non_directory.go", "overlay.go", + "regular_file.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", + "//pkg/sentry/arch", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", @@ -37,5 +39,6 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index b3d19ff82..4506642ca 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -40,6 +42,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { return nil } + // Attach our credentials to the context, as some VFS operations use + // credentials from context rather than take an explicit creds parameter. + ctx = auth.ContextWithCredentials(ctx, d.fs.creds) + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT switch ftype { case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR: @@ -69,13 +75,28 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { return syserror.ENOENT } - // Perform copy-up. + // Obtain settable timestamps from the lower layer. vfsObj := d.fs.vfsfs.VirtualFilesystem() + oldpop := vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + } + const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME + oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{ + Mask: timestampsMask, + }) + if err != nil { + return err + } + + // Perform copy-up. newpop := vfs.PathOperation{ Root: d.parent.upperVD, Start: d.parent.upperVD, Path: fspath.Parse(d.name), } + // Used during copy-up of memory-mapped regular files.
+ var mmapOpts *memmap.MMapOpts cleanupUndoCopyUp := func() { var err error if ftype == linux.S_IFDIR { @@ -84,15 +105,16 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) } if err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)) + } + if d.upperVD.Ok() { + d.upperVD.DecRef(ctx) + d.upperVD = vfs.VirtualDentry{} + } } switch ftype { case linux.S_IFREG: - oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.lowerVDs[0], - Start: d.lowerVDs[0], - }, &vfs.OpenOptions{ + oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { @@ -127,11 +149,32 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { break } } + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if d.wrappedMappable != nil { + // We may have memory mappings of the file on the lower layer. + // Switch to mapping the file on the upper layer instead. + mmapOpts = &memmap.MMapOpts{ + Perms: usermem.ReadWrite, + MaxPerms: usermem.ReadWrite, + } + if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil { + cleanupUndoCopyUp() + return err + } + if mmapOpts.MappingIdentity != nil { + mmapOpts.MappingIdentity.DecRef(ctx) + } + // Don't actually switch Mappables until the end of copy-up; see + // below for why. + } if err := newFD.SetStat(ctx, vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -148,9 +191,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -164,10 +209,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { d.upperVD = upperVD case linux.S_IFLNK: - target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.lowerVDs[0], - Start: d.lowerVDs[0], - }) + target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop) if err != nil { return err } @@ -176,10 +218,12 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID, - Mode: uint16(d.mode), - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, + Mode: uint16(d.mode), + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -193,25 +237,20 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { d.upperVD = upperVD case linux.S_IFBLK, linux.S_IFCHR: - lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.lowerVDs[0], - Start: d.lowerVDs[0], - }, &vfs.StatOptions{}) - if err != nil { - return err - } if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{ Mode: linux.FileMode(d.mode), -
DevMajor: lowerStat.RdevMajor, - DevMinor: lowerStat.RdevMinor, + DevMajor: oldStat.RdevMajor, + DevMinor: oldStat.RdevMinor, }); err != nil { return err } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -229,7 +268,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { panic(fmt.Sprintf("unexpected file type %o", ftype)) } - // TODO(gvisor.dev/issue/1199): copy up xattrs + if err := d.copyXattrsLocked(ctx); err != nil { + cleanupUndoCopyUp() + return err + } // Update the dentry's device and inode numbers (except for directories, // for which these remain overlay-assigned). @@ -241,14 +283,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Mask: linux.STATX_INO, }) if err != nil { - d.upperVD.DecRef(ctx) - d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return err } if upperStat.Mask&linux.STATX_INO == 0 { - d.upperVD.DecRef(ctx) - d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return syserror.EREMOTE } @@ -257,6 +295,135 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { atomic.StoreUint64(&d.ino, upperStat.Ino) } + if mmapOpts != nil && mmapOpts.Mappable != nil { + // Note that if mmapOpts != nil, then d.mapsMu is locked for writing + // (from the S_IFREG path above). + + // Propagate mappings of d to the new Mappable. Remember which mappings + // we added so we can remove them on failure. + upperMappable := mmapOpts.Mappable + allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + added := make(memmap.MappingsOfRange) + for m := range seg.Value() { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { + for m := range added { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + for mr, mappings := range allAdded { + for m := range mappings { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) + } + } + return err + } + added[m] = struct{}{} + } + allAdded[seg.Range()] = added + } + + // Switch to the new Mappable. We do this at the end of copy-up + // because: + // + // - We need to switch Mappables (by changing d.wrappedMappable) before + // invalidating Translations from the old Mappable (to pick up + // Translations from the new one). + // + // - We need to lock d.dataMu while changing d.wrappedMappable, but + // must invalidate Translations with d.dataMu unlocked (due to lock + // ordering). + // + // - Consequently, once we unlock d.dataMu, other threads may + // immediately observe the new (copied-up) Mappable, which we want to + // delay until copy-up is guaranteed to succeed. + d.dataMu.Lock() + lowerMappable := d.wrappedMappable + d.wrappedMappable = upperMappable + d.dataMu.Unlock() + d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Remove mappings from the old Mappable.
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + } + d.lowerMappings.RemoveAll() + } + atomic.StoreUint32(&d.copiedUp, 1) return nil } + +// copyXattrsLocked copies a subset of lower's extended attributes to upper. +// Attributes that configure an overlay in the lower are not copied up. +// +// Preconditions: d.copyMu must be locked for writing. +func (d *dentry) copyXattrsLocked(ctx context.Context) error { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]} + upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD} + + lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0) + if err != nil { + if err == syserror.EOPNOTSUPP { + // There are no guarantees as to the contents of lowerXattrs. + return nil + } + ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err) + return err + } + + for _, name := range lowerXattrs { + // Do not copy up overlay attributes. + if isOverlayXattr(name) { + continue + } + + value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0}) + if err != nil { + ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err) + return err + } + + if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil { + ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err) + return err + } + } + return nil +} + +// copyUpDescendantsLocked ensures that all descendants of d are copied up. +// +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error { + dirents, err := d.getDirentsLocked(ctx) + if err != nil { + return err + } + for _, dirent := range dirents { + if dirent.Name == "." || dirent.Name == ".." { + continue + } + child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds) + if err != nil { + return err + } + if err := child.copyUpLocked(ctx); err != nil { + return err + } + if child.isDir() { + child.dirMu.Lock() + err := child.copyUpDescendantsLocked(ctx, ds) + child.dirMu.Unlock() + if err != nil { + return err + } + } + } + return nil +} diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index 6a79f7ffe..df4492346 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -29,7 +29,9 @@ func (d *dentry) isDir() bool { return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR } -// Preconditions: d.dirMu must be locked. d.isDir(). +// Preconditions: +// * d.dirMu must be locked. +// * d.isDir(). func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) { vfsObj := d.fs.vfsfs.VirtualFilesystem() var readdirErr error @@ -98,12 +100,13 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string return whiteouts, readdirErr } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } @@ -114,10 +117,12 @@ func (fd *directoryFD) Release(ctx context.Context) { // IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + d := fd.dentry() + defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) + fd.mu.Lock() defer fd.mu.Unlock() - d := fd.dentry() if fd.dirents == nil { ds, err := d.getDirents(ctx) if err != nil { @@ -141,7 +146,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() + return d.getDirentsLocked(ctx) +} +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) { if d.dirents != nil { return d.dirents, nil } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 86d0164b4..78a01bbb7 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -15,6 +15,8 @@ package overlay import ( + "fmt" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -27,10 +29,15 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs +// attributes. +// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX +const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay." + // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for // opaque directories. // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE -const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque" +const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque" func isWhiteout(stat *linux.Statx) bool { return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 @@ -110,8 +117,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // Dentries which may have a reference count of zero, and which therefore // should be dropped once traversal is complete, are appended to ds. // -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR @@ -159,7 +168,9 @@ afterSymlink: return child, nil } -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if child, ok := parent.children[name]; ok { return child, nil @@ -177,7 +188,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s return child, nil } -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. 
func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { childPath := fspath.Parse(name) child := fs.newDentry() @@ -199,6 +212,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str lookupErr = err return false } + defer childVD.DecRef(ctx) mask := uint32(linux.STATX_TYPE) if !existsOnAnyLayer { @@ -237,6 +251,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str } // Update child to include this layer. + childVD.IncRef() if isUpper { child.upperVD = childVD child.copiedUp = 1 @@ -261,10 +276,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str // Directories are merged with directories from lower layers if they // are not explicitly opaque. - opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{ + opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ Root: childVD, Start: childVD, - }, &vfs.GetxattrOptions{ + }, &vfs.GetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Size: 1, }) @@ -300,7 +315,9 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str // lookupLayerLocked is similar to lookupLocked, but only returns information // about the file rather than a dentry. // -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) { childPath := fspath.Parse(name) lookupLayer := lookupLayerNone @@ -385,7 +402,9 @@ func (ll lookupLayer) existsInOverlay() bool { // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -425,8 +444,9 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, // doCreateAt checks that creating a file at rp is permitted, then invokes // create to do so. // -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). 
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() @@ -479,7 +499,13 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil { return err } + parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) return nil } @@ -493,7 +519,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)) } } @@ -605,12 +631,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) } return err } + old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) return nil }) } @@ -644,7 +671,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -654,12 +681,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // There may be directories on lower layers (previously hidden by // the whiteout) that the new directory should not be merged with. // Mark it opaque to prevent merging. 
- if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{ + if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Value: "y", }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)) } else { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -703,7 +730,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -732,10 +759,13 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf start := rp.Start().Impl().(*dentry) if rp.Done() { + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } if mustCreate { return nil, syserror.EEXIST } - if mayWrite { + if start.isRegularFile() && mayWrite { if err := start.copyUpLocked(ctx); err != nil { return nil, err } @@ -755,6 +785,10 @@ afterTrailingSymlink: if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } + // Reject attempts to open directories with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) @@ -763,12 +797,11 @@ afterTrailingSymlink: parent.dirMu.Unlock() return fd, err } + parent.dirMu.Unlock() if err != nil { - parent.dirMu.Unlock() return nil, err } // Open existing child or follow symlink. - parent.dirMu.Unlock() if mustCreate { return nil, syserror.EEXIST } @@ -783,7 +816,10 @@ afterTrailingSymlink: start = parent goto afterTrailingSymlink } - if mayWrite { + if rp.MustBeDir() && !child.isDir() { + return nil, syserror.ENOTDIR + } + if child.isRegularFile() && mayWrite { if err := child.copyUpLocked(ctx); err != nil { return nil, err } @@ -836,8 +872,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * if err != nil { return nil, err } + if ftype != linux.S_IFREG { + return layerFD, nil + } layerFlags := layerFD.StatusFlags() - fd := &nonDirectoryFD{ + fd := ®ularFileFD{ copiedUp: isUpper, cachedFD: layerFD, cachedFlags: layerFlags, @@ -851,8 +890,9 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * return &fd.vfsfd, nil } -// Preconditions: parent.dirMu must be locked. parent does not already contain -// a child named rp.Component(). +// Preconditions: +// * parent.dirMu must be locked. +// * parent does not already contain a child named rp.Component(). 
func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { creds := rp.Credentials() if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil { @@ -913,7 +953,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -924,7 +964,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving child, err := fs.getChildLocked(ctx, parent, childName, ds) if err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -932,7 +972,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving } // Finally construct the overlay FD. upperFlags := upperFD.StatusFlags() - fd := &nonDirectoryFD{ + fd := ®ularFileFD{ copiedUp: true, cachedFD: upperFD, cachedFlags: upperFlags, @@ -945,6 +985,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving // just can't open it anymore for some reason. return nil, err } + parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */) return &fd.vfsfd, nil } @@ -990,9 +1031,224 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } defer mnt.EndWrite() - // FIXME(gvisor.dev/issue/1199): Actually implement rename. - _ = newParent - return syserror.EXDEV + oldParent := oldParentVD.Dentry().Impl().(*dentry) + creds := rp.Credentials() + if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. 
+ oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) + if err != nil { + return err + } + if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil { + return err + } + if renamed.isDir() { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.vfsd.IsDead() { + return syserror.ENOENT + } + replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName) + if err != nil { + return err + } + var ( + replaced *dentry + replacedVFSD *vfs.Dentry + whiteouts map[string]bool + ) + if replacedLayer.existsInOverlay() { + replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds) + if err != nil { + return err + } + replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } + replaced.dirMu.Lock() + defer replaced.dirMu.Unlock() + whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx) + if err != nil { + return err + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + + // renamed and oldParent need to be copied-up before they're renamed on the + // upper layer. + if err := renamed.copyUpLocked(ctx); err != nil { + return err + } + // If renamed is a directory, all of its descendants need to be copied-up + // before they're renamed on the upper layer. + if renamed.isDir() { + if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil { + return err + } + } + // newParent must be copied-up before it can contain renamed on the upper + // layer. + if err := newParent.copyUpLocked(ctx); err != nil { + return err + } + // If replaced exists, it doesn't need to be copied-up, but we do need to + // serialize with copy-up. Holding renameMu for writing should be + // sufficient, but out of an abundance of caution... 
+ if replaced != nil { + replaced.copyMu.RLock() + defer replaced.copyMu.RUnlock() + } + + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef(ctx) + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { + return err + } + + newpop := vfs.PathOperation{ + Root: newParent.upperVD, + Start: newParent.upperVD, + Path: fspath.Parse(newName), + } + + needRecreateWhiteouts := false + cleanupRecreateWhiteouts := func() { + if !needRecreateWhiteouts { + return + } + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil && err != syserror.EEXIST { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err)) + } + } + } + if renamed.isDir() { + if replacedLayer == lookupLayerUpper { + // Remove whiteouts from the directory being replaced. + needRecreateWhiteouts = true + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } else if replacedLayer == lookupLayerUpperWhiteout { + // We need to explicitly remove the whiteout since otherwise rename + // on the upper layer will fail with ENOTDIR. + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } + + // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a + // regular rename and create the whiteout at the origin manually. Unlike + // RENAME_WHITEOUT, this isn't atomic with respect to other users of the + // upper filesystem, but this is already the case for virtually all other + // overlay filesystem operations too. + oldpop := vfs.PathOperation{ + Root: oldParent.upperVD, + Start: oldParent.upperVD, + Path: fspath.Parse(oldName), + } + if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + + // Below this point, the renamed dentry is now at newpop, and anything we + // replaced is gone forever. Commit the rename, update the overlay + // filesystem tree, and abandon attempts to recover from errors. + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) + delete(oldParent.children, oldName) + if replaced != nil { + ds = appendDentry(ds, replaced) + } + if oldParent != newParent { + newParent.dirents = nil + // This can't drop the last reference on oldParent because one is held + // by oldParentVD, so lock recursion is impossible. 
+ oldParent.DecRef(ctx) + ds = appendDentry(ds, oldParent) + newParent.IncRef() + renamed.parent = newParent + } + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + oldParent.dirents = nil + + if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err)) + } + if renamed.isDir() { + if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Value: "y", + }); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err)) + } + } + + vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) + return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. @@ -1040,6 +1296,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if !child.isDir() { return syserror.ENOTDIR } + if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil { + return err + } child.dirMu.Lock() defer child.dirMu.Unlock() whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx) @@ -1071,7 +1330,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error Start: child.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil && err != syserror.EEXIST { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)) } } } @@ -1101,15 +1360,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // Don't attempt to recover from this: the original directory is // already gone, so any dentries representing it are invalid, and // creating a new directory won't undo that. - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err) - vfsObj.AbortDeleteDentry(&child.vfsd) - return err + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)) } vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) parent.dirents = nil + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */) return nil } @@ -1117,12 +1375,25 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + err = d.setStatLocked(ctx, rp, opts) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if err != nil { return err } + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent) + } + return nil +} + +// Precondition: d.fs.renameMu must be held for reading. 
+func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err @@ -1217,7 +1488,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -1263,12 +1534,38 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } + parentMode := atomic.LoadUint32(&parent.mode) child := parent.children[name] var childLayer lookupLayer + if child == nil { + if parentMode&linux.S_ISVTX != 0 { + // If the parent's sticky bit is set, we need a child dentry to get + // its owner. + child, err = fs.getChildLocked(ctx, parent, name, &ds) + if err != nil { + return err + } + } else { + // Determine if the file being unlinked actually exists. Holding + // parent.dirMu prevents a dentry from being instantiated for the file, + // which in turn prevents it from being copied-up, so this result is + // stable. + childLayer, err = fs.lookupLayerLocked(ctx, parent, name) + if err != nil { + return err + } + if !childLayer.existsInOverlay() { + return syserror.ENOENT + } + } + } if child != nil { if child.isDir() { return syserror.EISDIR } + if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil { + return err + } if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } @@ -1281,18 +1578,6 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } else { childLayer = lookupLayerLower } - } else { - // Determine if the file being unlinked actually exists. Holding - // parent.dirMu prevents a dentry from being instantiated for the file, - // which in turn prevents it from being copied-up, so this result is - // stable. - childLayer, err = fs.lookupLayerLocked(ctx, parent, name) - if err != nil { - return err - } - if !childLayer.existsInOverlay() { - return syserror.ENOENT - } } pop := vfs.PathOperation{ @@ -1310,70 +1595,175 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } } if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err) - if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - } - return err + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)) } + var cw *vfs.Watches if child != nil { vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) + cw = &child.watches } + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) parent.dirents = nil return nil } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. 
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// isOverlayXattr returns whether the given extended attribute configures the +// overlay. +func isOverlayXattr(name string) bool { + return strings.HasPrefix(name, _OVL_XATTR_PREFIX) +} + +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + + return fs.listXattr(ctx, d, size) +} + +func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + top := d.topLayer() + names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size) if err != nil { return nil, err } - // TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr, - // but not any other xattr syscalls. For now we just reject all of them. - return nil, syserror.ENOTSUP + + // Filter out all overlay attributes. + n := 0 + for _, name := range names { + if !isOverlayXattr(name) { + names[n] = name + n++ + } + } + return names[:n], err } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } - return "", syserror.ENOTSUP + + return fs.getXattr(ctx, d, rp.Credentials(), &opts) +} + +func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { + return "", err + } + + // Return EOPNOTSUPP when fetching an overlay attribute. + // See fs/overlayfs/super.c:ovl_own_xattr_get(). + if isOverlayXattr(opts.Name) { + return "", syserror.EOPNOTSUPP + } + + // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get(). + vfsObj := d.fs.vfsfs.VirtualFilesystem() + top := d.topLayer() + return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) return err } - return syserror.ENOTSUP + + err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) + return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// Precondition: fs.renameMu must be locked. +func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { + return err + } + + // Return EOPNOTSUPP when setting an overlay attribute. + // See fs/overlayfs/super.c:ovl_own_xattr_set(). + if isOverlayXattr(opts.Name) { + return syserror.EOPNOTSUPP + } + + // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set(). + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts) +} + +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + + err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) if err != nil { return err } - return syserror.ENOTSUP + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) + return nil +} + +// Precondition: fs.renameMu must be locked. +func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error { + if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { + return err + } + + // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute. + // Linux passes the remove request to xattr_handler->set. + // See fs/xattr.c:vfs_removexattr(). + if isOverlayXattr(name) { + return syserror.EOPNOTSUPP + } + + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go deleted file mode 100644 index d3060a481..000000000 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright 2020 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package overlay - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/usermem" -) - -func (d *dentry) isSymlink() bool { - return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK -} - -func (d *dentry) readlink(ctx context.Context) (string, error) { - layerVD := d.topLayer() - return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: layerVD, - Start: layerVD, - }) -} - -type nonDirectoryFD struct { - fileDescription - - // If copiedUp is false, cachedFD represents - // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents - // fileDescription.dentry().upperVD. cachedFlags is the last known value of - // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are - // protected by mu. - mu sync.Mutex - copiedUp bool - cachedFD *vfs.FileDescription - cachedFlags uint32 -} - -func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { - fd.mu.Lock() - defer fd.mu.Unlock() - wrappedFD, err := fd.currentFDLocked(ctx) - if err != nil { - return nil, err - } - wrappedFD.IncRef() - return wrappedFD, nil -} - -func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { - d := fd.dentry() - statusFlags := fd.vfsfd.StatusFlags() - if !fd.copiedUp && d.isCopiedUp() { - // Switch to the copied-up file. - upperVD := d.topLayer() - upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: upperVD, - Start: upperVD, - }, &vfs.OpenOptions{ - Flags: statusFlags, - }) - if err != nil { - return nil, err - } - oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) - if oldOffErr == nil { - if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { - upperFD.DecRef(ctx) - return nil, err - } - } - fd.cachedFD.DecRef(ctx) - fd.copiedUp = true - fd.cachedFD = upperFD - fd.cachedFlags = statusFlags - } else if fd.cachedFlags != statusFlags { - if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { - return nil, err - } - fd.cachedFlags = statusFlags - } - return fd.cachedFD, nil -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *nonDirectoryFD) Release(ctx context.Context) { - fd.cachedFD.DecRef(ctx) - fd.cachedFD = nil -} - -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *nonDirectoryFD) OnClose(ctx context.Context) error { - // Linux doesn't define ovl_file_operations.flush at all (i.e. its - // equivalent to OnClose is a no-op). 
We pass through to - // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been - // copied-up, since OnClose is mostly used to define post-close writeback, - // and if fd.cachedFD hasn't been updated then it can't have been used to - // mutate fd.dentry() anyway. - fd.mu.Lock() - if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags { - if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil { - fd.mu.Unlock() - return err - } - fd.cachedFlags = statusFlags - } - wrappedFD := fd.cachedFD - defer wrappedFD.IncRef() - fd.mu.Unlock() - return wrappedFD.OnClose(ctx) -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - var stat linux.Statx - if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { - wrappedFD, err := fd.getCurrentFD(ctx) - if err != nil { - return linux.Statx{}, err - } - stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{ - Mask: layerMask, - Sync: opts.Sync, - }) - wrappedFD.DecRef(ctx) - if err != nil { - return linux.Statx{}, err - } - } - fd.dentry().statInternalTo(ctx, &opts, &stat) - return stat, nil -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - d := fd.dentry() - mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { - return err - } - mnt := fd.vfsfd.Mount() - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - if err := d.copyUpLocked(ctx); err != nil { - return err - } - // Changes to d's attributes are serialized by d.copyMu. - d.copyMu.Lock() - defer d.copyMu.Unlock() - wrappedFD, err := fd.currentFDLocked(ctx) - if err != nil { - return err - } - if err := wrappedFD.SetStat(ctx, opts); err != nil { - return err - } - d.updateAfterSetStatLocked(&opts) - return nil -} - -// StatFS implements vfs.FileDescriptionImpl.StatFS. -func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { - return fd.filesystem().statFS(ctx) -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - wrappedFD, err := fd.getCurrentFD(ctx) - if err != nil { - return 0, err - } - defer wrappedFD.DecRef(ctx) - return wrappedFD.PRead(ctx, dst, offset, opts) -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - // Hold fd.mu during the read to serialize the file offset. - fd.mu.Lock() - defer fd.mu.Unlock() - wrappedFD, err := fd.currentFDLocked(ctx) - if err != nil { - return 0, err - } - return wrappedFD.Read(ctx, dst, opts) -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - wrappedFD, err := fd.getCurrentFD(ctx) - if err != nil { - return 0, err - } - defer wrappedFD.DecRef(ctx) - return wrappedFD.PWrite(ctx, src, offset, opts) -} - -// Write implements vfs.FileDescriptionImpl.Write. 
-func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - // Hold fd.mu during the write to serialize the file offset. - fd.mu.Lock() - defer fd.mu.Unlock() - wrappedFD, err := fd.currentFDLocked(ctx) - if err != nil { - return 0, err - } - return wrappedFD.Write(ctx, src, opts) -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - // Hold fd.mu during the seek to serialize the file offset. - fd.mu.Lock() - defer fd.mu.Unlock() - wrappedFD, err := fd.currentFDLocked(ctx) - if err != nil { - return 0, err - } - return wrappedFD.Seek(ctx, offset, whence) -} - -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *nonDirectoryFD) Sync(ctx context.Context) error { - fd.mu.Lock() - if !fd.dentry().isCopiedUp() { - fd.mu.Unlock() - return nil - } - wrappedFD, err := fd.currentFDLocked(ctx) - if err != nil { - fd.mu.Unlock() - return err - } - wrappedFD.IncRef() - defer wrappedFD.DecRef(ctx) - fd.mu.Unlock() - return wrappedFD.Sync(ctx) -} - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - wrappedFD, err := fd.getCurrentFD(ctx) - if err != nil { - return err - } - defer wrappedFD.DecRef(ctx) - return wrappedFD.ConfigureMMap(ctx, opts) -} diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 75cc006bf..4c5de8d32 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -18,10 +18,14 @@ // // Lock order: // -// directoryFD.mu / nonDirectoryFD.mu +// directoryFD.mu / regularFileFD.mu // filesystem.renameMu // dentry.dirMu // dentry.copyMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.dataMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -37,6 +41,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -46,6 +51,8 @@ import ( const Name = "overlay" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. @@ -53,8 +60,13 @@ func (FilesystemType) Name() string { return Name } +// Release implements FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to // FilesystemType.GetFilesystem. +// +// +stateify savable type FilesystemOptions struct { // Callers passing FilesystemOptions to // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that @@ -71,6 +83,8 @@ type FilesystemOptions struct { } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -93,7 +107,7 @@ type filesystem struct { // renameMu synchronizes renaming with non-renaming operations in order to // ensure consistent lock ordering between dentry.dirMu in different // dentries. 
- renameMu sync.RWMutex + renameMu sync.RWMutex `state:"nosave"` // lastDirIno is the last inode number assigned to a directory. lastDirIno // is accessed using atomic memory operations. @@ -106,16 +120,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsoptsRaw := opts.InternalData fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !haveFSOpts { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) + ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) return nil, nil, syserror.EINVAL } if haveFSOpts { if len(fsopts.LowerRoots) == 0 { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") + ctx.Infof("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") return nil, nil, syserror.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") return nil, nil, syserror.EINVAL } // We don't enforce a maximum number of lower layers when not @@ -132,7 +146,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, "workdir") upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) + ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) return nil, nil, syserror.EINVAL } upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -144,13 +158,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt CheckSearchable: true, }) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } defer upperRoot.DecRef(ctx) privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } defer privateUpperRoot.DecRef(ctx) @@ -158,24 +172,24 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } lowerPathnamesStr, ok := mopts["lowerdir"] if !ok { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") + ctx.Infof("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") return nil, nil, syserror.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") return nil, nil, 
syserror.EINVAL } if len(lowerPathnames) > maxLowerLayers { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) + ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) return nil, nil, syserror.EINVAL } for _, lowerPathname := range lowerPathnames { lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) + ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) return nil, nil, syserror.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -187,13 +201,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt CheckSearchable: true, }) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer lowerRoot.DecRef(ctx) privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer privateLowerRoot.DecRef(ctx) @@ -201,7 +215,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } if len(mopts) != 0 { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) + ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) return nil, nil, syserror.EINVAL } @@ -274,7 +288,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EREMOTE } if isWhiteout(&rootStat) { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") + ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL @@ -315,7 +329,11 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc if err != nil { return vfs.VirtualDentry{}, err } - return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil + // Take a reference on the dentry which will be owned by the returned + // VirtualDentry. + d := vd.Dentry() + d.IncRef() + return vfs.MakeVirtualDentry(newmnt, d), nil } // Release implements vfs.FilesystemImpl.Release. @@ -358,6 +376,8 @@ func (fs *filesystem) newDirIno() uint64 { } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -390,7 +410,7 @@ type dentry struct { // and dirents (if not nil) is a cache of dirents as returned by // directoryFDs representing this directory. children is protected by // dirMu. - dirMu sync.Mutex + dirMu sync.Mutex `state:"nosave"` children map[string]*dentry dirents []vfs.Dirent @@ -400,7 +420,7 @@ type dentry struct { // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. // be copied up) with copyMu locked for writing; otherwise, it is // immutable. lowerVDs is always immutable. 
-	copyMu sync.RWMutex
+	copyMu sync.RWMutex `state:"nosave"`
 	upperVD  vfs.VirtualDentry
 	lowerVDs []vfs.VirtualDentry
@@ -415,7 +435,43 @@ type dentry struct {
 	devMinor uint32
 	ino      uint64
 
+	// If this dentry represents a regular file, then:
+	//
+	// - mapsMu is used to synchronize between copy-up and memmap.Mappable
+	// methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+	//
+	// - dataMu is used to synchronize between copy-up and
+	// dentry.(memmap.Mappable).Translate.
+	//
+	// - lowerMappings tracks memory mappings of the file. lowerMappings is
+	// used to invalidate mappings of the lower layer when the file is copied
+	// up to ensure that they remain coherent with subsequent writes to the
+	// file. (Note that, as of this writing, Linux overlayfs does not do this;
+	// this feature is a gVisor extension.) lowerMappings is protected by
+	// mapsMu.
+	//
+	// - If this dentry is copied-up, then wrappedMappable is the Mappable
+	// obtained from a call to the current top layer's
+	// FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+	// (from a call to regularFileFD.ensureMappable()), it cannot become nil.
+	// wrappedMappable is protected by mapsMu and dataMu.
+	//
+	// - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+	// accessed using atomic memory operations.
+	mapsMu sync.Mutex
+	lowerMappings memmap.MappingSet
+	dataMu sync.RWMutex
+	wrappedMappable memmap.Mappable
+	isMappable uint32
+
 	locks vfs.FileLocks
+
+	// watches is the set of inotify watches on the file represented by this dentry.
+	//
+	// Note that hard links to the same file will not share the same set of
+	// watches, due to the fact that we do not have inode structures in this
+	// overlay implementation.
+	watches vfs.Watches
 }
 
 // newDentry creates a new dentry. The dentry initially has no references; it
@@ -475,6 +531,14 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 	if atomic.LoadInt64(&d.refs) != 0 {
 		return
 	}
+
+	// Make sure that we do not lose watches on dentries that have not been
+	// deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
+	// d.vfsd.IsDead() indicates that d was deleted.
+	if !d.vfsd.IsDead() && d.watches.Size() > 0 {
+		return
+	}
+
 	// Refs is still zero; destroy it.
 	d.destroyLocked(ctx)
 	return
@@ -482,7 +546,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 
 // destroyLocked destroys the dentry.
 //
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
 func (d *dentry) destroyLocked(ctx context.Context) {
 	switch atomic.LoadInt64(&d.refs) {
 	case 0:
@@ -501,6 +567,8 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 		lowerVD.DecRef(ctx)
 	}
 
+	d.watches.HandleDeletion(ctx)
+
 	if d.parent != nil {
 		d.parent.dirMu.Lock()
 		if !d.vfsd.IsDead() {
@@ -519,19 +587,36 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
-	// TODO(gvisor.dev/issue/1479): Implement inotify.
+	if d.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+	// that d was deleted.
+	deleted := d.vfsd.IsDead()
+
+	d.fs.renameMu.RLock()
+	// The ordering below is important: Linux always notifies the parent first.
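+	// The parent's watch set receives the event tagged with this dentry's
+	// name, while the dentry's own watch set receives it with an empty name.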
+	if d.parent != nil {
+		d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+	}
+	d.watches.Notify(ctx, "", events, cookie, et, deleted)
+	d.fs.renameMu.RUnlock()
 }
 
 // Watches implements vfs.DentryImpl.Watches.
 func (d *dentry) Watches() *vfs.Watches {
-	// TODO(gvisor.dev/issue/1479): Implement inotify.
-	return nil
+	return &d.watches
 }
 
 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *dentry) OnZeroWatches(context.Context) {}
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+	if atomic.LoadInt64(&d.refs) == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked(ctx)
+		d.fs.renameMu.Unlock()
+	}
+}
 
 // iterLayers invokes yield on each layer comprising d, from top to bottom. If
 // any call to yield returns false, iterLayers stops iteration.
@@ -564,6 +649,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 // statInternalMask is the set of stat fields that is set by
 // dentry.statInternalTo().
 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -602,6 +697,8 @@ func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
 
 // fileDescription is embedded by overlay implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -616,6 +713,48 @@ func (fd *fileDescription) dentry() *dentry {
 	return fd.vfsfd.Dentry().Impl().(*dentry)
 }
 
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+	fs := fd.filesystem()
+	d := fd.dentry()
+
+	fs.renameMu.RLock()
+	err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+	fs.renameMu.RUnlock()
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
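+//
+// As with SetXattr above, the underlying removal happens with renameMu held
+// for reading, and a successful removal generates an IN_ATTRIB event.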
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { + fs := fd.filesystem() + d := fd.dentry() + + fs.renameMu.RLock() + err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) + fs.renameMu.RUnlock() + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go new file mode 100644 index 000000000..2b89a7a6d --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -0,0 +1,456 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func (d *dentry) isRegularFile() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG +} + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + layerVD := d.topLayer() + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }) +} + +// +stateify savable +type regularFileFD struct { + fileDescription + + // If copiedUp is false, cachedFD represents + // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents + // fileDescription.dentry().upperVD. cachedFlags is the last known value of + // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are + // protected by mu. + mu sync.Mutex `state:"nosave"` + copiedUp bool + cachedFD *vfs.FileDescription + cachedFlags uint32 + + // If copiedUp is false, lowerWaiters contains all waiter.Entries + // registered with cachedFD. lowerWaiters is protected by mu. 
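+	//
+	// lowerWaiters exists so that, when the FD switches to the copied-up
+	// file, entries registered against the lower file can be re-registered
+	// with (and, if already ready, notified by) the upper file; see
+	// currentFDLocked.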
+ lowerWaiters map[*waiter.Entry]waiter.EventMask +} + +func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return nil, err + } + wrappedFD.IncRef() + return wrappedFD, nil +} + +func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { + d := fd.dentry() + statusFlags := fd.vfsfd.StatusFlags() + if !fd.copiedUp && d.isCopiedUp() { + // Switch to the copied-up file. + upperVD := d.topLayer() + upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: upperVD, + Start: upperVD, + }, &vfs.OpenOptions{ + Flags: statusFlags, + }) + if err != nil { + return nil, err + } + oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) + if oldOffErr == nil { + if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { + upperFD.DecRef(ctx) + return nil, err + } + } + if len(fd.lowerWaiters) != 0 { + ready := upperFD.Readiness(^waiter.EventMask(0)) + for e, mask := range fd.lowerWaiters { + fd.cachedFD.EventUnregister(e) + upperFD.EventRegister(e, mask) + if ready&mask != 0 { + e.Callback.Callback(e) + } + } + } + fd.cachedFD.DecRef(ctx) + fd.copiedUp = true + fd.cachedFD = upperFD + fd.cachedFlags = statusFlags + fd.lowerWaiters = nil + } else if fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { + return nil, err + } + fd.cachedFlags = statusFlags + } + return fd.cachedFD, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release(ctx context.Context) { + fd.cachedFD.DecRef(ctx) + fd.cachedFD = nil +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *regularFileFD) OnClose(ctx context.Context) error { + // Linux doesn't define ovl_file_operations.flush at all (i.e. its + // equivalent to OnClose is a no-op). We pass through to + // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been + // copied-up, since OnClose is mostly used to define post-close writeback, + // and if fd.cachedFD hasn't been updated then it can't have been used to + // mutate fd.dentry() anyway. + fd.mu.Lock() + if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil { + fd.mu.Unlock() + return err + } + fd.cachedFlags = statusFlags + } + wrappedFD := fd.cachedFD + fd.mu.Unlock() + return wrappedFD.OnClose(ctx) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return linux.Statx{}, err + } + stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{ + Mask: layerMask, + Sync: opts.Sync, + }) + wrappedFD.DecRef(ctx) + if err != nil { + return linux.Statx{}, err + } + } + fd.dentry().statInternalTo(ctx, &opts, &stat) + return stat, nil +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Allocate(ctx, mode, offset, length) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. 
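+//
+// SetStat copies the file up before applying changes, since the lower layers
+// are mounted read-only and attribute changes must land on the upper layer.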
+func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + d := fd.dentry() + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + mnt := fd.vfsfd.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + // Changes to d's attributes are serialized by d.copyMu. + d.copyMu.Lock() + defer d.copyMu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return err + } + if err := wrappedFD.SetStat(ctx, opts); err != nil { + return err + } + d.updateAfterSetStatLocked(&opts) + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) + } + return nil +} + +// StatFS implements vfs.FileDescriptionImpl.StatFS. +func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) { + return fd.filesystem().statFS(ctx) +} + +// Readiness implements waiter.Waitable.Readiness. +func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { + ctx := context.Background() + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + // TODO(b/171089913): Just use fd.cachedFD since Readiness can't return + // an error. This is obviously wrong, but at least consistent with + // VFS1. + log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err) + fd.mu.Lock() + wrappedFD = fd.cachedFD + wrappedFD.IncRef() + fd.mu.Unlock() + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(context.Background()) + if err != nil { + // TODO(b/171089913): Just use fd.cachedFD since EventRegister can't + // return an error. This is obviously wrong, but at least consistent + // with VFS1. + log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err) + wrappedFD = fd.cachedFD + } + wrappedFD.EventRegister(e, mask) + if !fd.copiedUp { + if fd.lowerWaiters == nil { + fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask) + } + fd.lowerWaiters[e] = mask + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *regularFileFD) EventUnregister(e *waiter.Entry) { + fd.mu.Lock() + defer fd.mu.Unlock() + fd.cachedFD.EventUnregister(e) + if !fd.copiedUp { + delete(fd.lowerWaiters, e) + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Hold fd.mu during the read to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. 
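+//
+// Unlike Write below, PWrite does not hold fd.mu across the write, since
+// positional I/O does not use the shared file offset.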
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Hold fd.mu during the write to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Write(ctx, src, opts) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Hold fd.mu during the seek to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Seek(ctx, offset, whence) +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + fd.mu.Lock() + if !fd.dentry().isCopiedUp() { + fd.mu.Unlock() + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + fd.mu.Unlock() + return err + } + wrappedFD.IncRef() + defer wrappedFD.DecRef(ctx) + fd.mu.Unlock() + return wrappedFD.Sync(ctx) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Ioctl(ctx, uio, args) +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if err := fd.ensureMappable(ctx, opts); err != nil { + return err + } + return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts) +} + +// ensureMappable ensures that fd.dentry().wrappedMappable is not nil. +func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + + // Fast path if we already have a Mappable for the current top layer. + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + + // Only permit mmap of regular files, since other file types may have + // unpredictable behavior when mmapped (e.g. /dev/zero). + if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { + return syserror.ENODEV + } + + // Get a Mappable for the current top layer. + fd.mu.Lock() + defer fd.mu.Unlock() + d.copyMu.RLock() + defer d.copyMu.RUnlock() + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return err + } + if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil { + return err + } + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef(ctx) + opts.MappingIdentity = nil + } + // Use this Mappable for all mappings of this layer (unless we raced with + // another call to ensureMappable). + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if d.wrappedMappable == nil { + d.wrappedMappable = opts.Mappable + atomic.StoreUint32(&d.isMappable, 1) + } + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. 
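+//
+// Mappings established before copy-up are also recorded in d.lowerMappings so
+// that they can be invalidated for coherence when the file is copied up.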
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, ar, offset, writable) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable) + if !d.isCopiedUp() { + d.lowerMappings.RemoveMapping(ms, ar, offset, writable) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, dstAR, offset, writable) + } + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.dataMu.RLock() + defer d.dataMu.RUnlock() + return d.wrappedMappable.Translate(ctx, required, optional, at) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + return d.wrappedMappable.InvalidateUnsavable(ctx) +} diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 2ca793db9..e44b79b68 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type filesystemType struct{} // Name implements vfs.FilesystemType.Name. @@ -38,11 +39,15 @@ func (filesystemType) Name() string { return "pipefs" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("pipefs.filesystemType.GetFilesystem should never be called") } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -76,6 +81,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -143,12 +150,14 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. return syserror.EPERM } -// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement -// statfs, from which we should indicate PIPEFS_MAGIC. - // Open implements kernfs.Inode.Open. 
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks) +} + +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read @@ -159,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry - d.Init(inode) + d.Init(&fs.Filesystem, inode) defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) } diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 14ecfd300..2e086e34c 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -1,18 +1,79 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "fd_dir_inode_refs", + out = "fd_dir_inode_refs.go", + package = "proc", + prefix = "fdDirInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "fdDirInode", + }, +) + +go_template_instance( + name = "fd_info_dir_inode_refs", + out = "fd_info_dir_inode_refs.go", + package = "proc", + prefix = "fdInfoDirInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "fdInfoDirInode", + }, +) + +go_template_instance( + name = "subtasks_inode_refs", + out = "subtasks_inode_refs.go", + package = "proc", + prefix = "subtasksInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "subtasksInode", + }, +) + +go_template_instance( + name = "task_inode_refs", + out = "task_inode_refs.go", + package = "proc", + prefix = "taskInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "taskInode", + }, +) + +go_template_instance( + name = "tasks_inode_refs", + out = "tasks_inode_refs.go", + package = "proc", + prefix = "tasksInode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "tasksInode", + }, +) + go_library( name = "proc", srcs = [ + "fd_dir_inode_refs.go", + "fd_info_dir_inode_refs.go", "filesystem.go", "subtasks.go", + "subtasks_inode_refs.go", "task.go", "task_fds.go", "task_files.go", + "task_inode_refs.go", "task_net.go", "tasks.go", "tasks_files.go", + "tasks_inode_refs.go", "tasks_sys.go", ], visibility = ["//pkg/sentry:internal"], @@ -39,6 +100,7 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/tcpip/network/ipv4", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 2463d51cd..fd70a07de 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -34,13 +34,15 @@ const Name = "proc" // +stateify savable type FilesystemType struct{} -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. 
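+//
+// It is a no-op: there is no filesystem-type-level state to release.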
+func (FilesystemType) Release(ctx context.Context) {} + +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -72,7 +74,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF cgroups = data.Cgroups } - _, dentry := procfs.newTasksInode(k, pidns, cgroups) + inode := procfs.newTasksInode(k, pidns, cgroups) + var dentry kernfs.Dentry + dentry.Init(&procfs.Filesystem, inode) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } @@ -84,6 +88,8 @@ func (fs *filesystem) Release(ctx context.Context) { // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. +// +// +stateify savable type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource @@ -91,14 +97,12 @@ type dynamicInode interface { Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } -func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (fs *filesystem) newInode(creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode { + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm) + return inode } +// +stateify savable type staticFile struct { kernfs.DynamicBytesFile vfs.StaticData @@ -110,8 +114,24 @@ func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } +func (fs *filesystem) newStaticDir(creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode { + return kernfs.NewStaticDir(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) +} + // InternalData contains internal data passed in to the procfs mount via // vfs.GetFilesystemOptions.InternalData. +// +// +stateify savable type InternalData struct { Cgroups map[string]string } + +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 79c2725f3..bad2fab4f 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,11 +31,14 @@ import ( // // +stateify savable type subtasksInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren - kernfs.AlwaysValid + subtasksInodeRefs locks vfs.FileLocks @@ -47,7 +50,7 @@ type subtasksInode struct { var _ kernfs.Inode = (*subtasksInode)(nil) -func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { subInode := &subtasksInode{ fs: fs, task: task, @@ -57,16 +60,14 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, // Note: credentials are overridden by taskOwnedInode. 
subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} - dentry := &kernfs.Dentry{} - dentry.Init(inode) - - return dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -79,12 +80,10 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, syserror.ENOENT } - - subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) - return subTaskDentry.VFSDentry(), nil + return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { @@ -115,6 +114,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb return offset, nil } +// +stateify savable type subtasksFD struct { kernfs.GenericDirectoryFD @@ -152,19 +152,21 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro return fd.GenericDirectoryFD.SetStat(ctx, opts) } -// Open implements kernfs.Inode. -func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} - if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil { + if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }); err != nil { return nil, err } - if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd.VFSFileDescription(), nil } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { @@ -176,7 +178,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs return stat, nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } + +// DecRef implements kernfs.Inode.DecRef. 
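+//
+// The reference count is maintained by the generated subtasksInodeRefs type;
+// the inode is destroyed once the count drops to zero.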
+func (i *subtasksInode) DecRef(ctx context.Context) { + i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index a5c7aa470..b63a4eca0 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,11 +32,13 @@ import ( // // +stateify savable type taskInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren - kernfs.InodeNoDynamicLookup + implStatFS kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren + taskInodeRefs locks vfs.FileLocks @@ -45,80 +47,92 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { - // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. - contents := map[string]*kernfs.Dentry{ - "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), - "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) { + if task.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + + contents := map[string]kernfs.Inode{ + "auxv": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), "comm": fs.newComm(task, fs.NextIno(), 0444), - "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "cwd": fs.newCwdSymlink(task, fs.NextIno()), + "environ": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), "exe": fs.newExeSymlink(task, fs.NextIno()), "fd": fs.newFDDirInode(task), "fdinfo": fs.newFDInfoDirInode(task), - "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), - "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "gid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}), "net": fs.newTaskNetDir(task), - "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), }), - "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, 
&smapsData{task: task}),
-		"stat":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":     fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
-		"status":    fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map":   fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}),
+		"status":        fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
 	}
 	if len(cgroupControllers) > 0 {
-		contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
+		contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
 	}
 
 	taskInode := &taskInode{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
 	taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	taskInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: taskInode, owner: task}
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
 
 	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := taskInode.OrderedChildren.Populate(dentry, contents)
+	links := taskInode.OrderedChildren.Populate(contents)
 	taskInode.IncLinks(links)
-
-	return dentry
+	return inode, nil
 }
 
-// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// Valid implements kernfs.Inode.Valid. This inode remains valid as long
 // as the task is still running. When it's dead, another task with the same
 // PID could replace it.
 func (i *taskInode) Valid(ctx context.Context) bool {
 	return i.task.ExitState() != kernel.TaskExitDead
 }
 
-// Open implements kernfs.Inode.
-func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *taskInode) DecRef(ctx context.Context) { + i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. +// +// +stateify savable type taskOwnedInode struct { kernfs.Inode @@ -128,34 +142,26 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) - taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d + return &taskOwnedInode{Inode: inode, owner: task} } -func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { - dir := &kernfs.StaticDirectory{} - +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. - dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) - - inode := &taskOwnedInode{Inode: dir, owner: task} - d := &kernfs.Dentry{} - d.Init(inode) + fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero} + dir := kernfs.NewStaticDir(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts) - dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := dir.OrderedChildren.Populate(d, children) - dir.IncLinks(links) + return &taskOwnedInode{Inode: dir, owner: task} +} - return d +func (i *taskOwnedInode) Valid(ctx context.Context) bool { + return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx) } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.Inode.Stat(ctx, fs, opts) if err != nil { @@ -173,7 +179,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs. return stat, nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index f0d3f7f5e..2c80ac5c2 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -52,6 +51,7 @@ func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { return true } +// +stateify savable type fdDir struct { locks vfs.FileLocks @@ -63,7 +63,7 @@ type fdDir struct { produceSymlink bool } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDirectory.IterDirents. 
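The DecRef implementations added throughout this change all share one shape: delegate to a generated reference-count type (taskInodeRefs above, and the fdDirInodeRefs, fdInfoDirInodeRefs, and tasksInodeRefs variants later on) and run Destroy exactly once when the count reaches zero. A minimal standalone sketch of that pattern, assuming a count biased by one so the zero value already holds a reference; the real generated types additionally wire up the leak checking enabled via EnableLeakCheck, and inodeRefs here is an invented name, not the actual template output:

package main

import (
	"fmt"
	"sync/atomic"
)

// inodeRefs sketches the generated refcount types used above. The
// stored count is biased by one: the zero value already holds a
// single reference.
type inodeRefs struct {
	refCount int64
}

// IncRef takes a new reference; taking one on an already-released
// object is a bug, so it panics.
func (r *inodeRefs) IncRef() {
	if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
		panic("IncRef on released inode")
	}
}

// DecRef drops a reference and runs destroy exactly once, when the
// count reaches zero.
func (r *inodeRefs) DecRef(destroy func()) {
	switch v := atomic.AddInt64(&r.refCount, -1); {
	case v < -1:
		panic("DecRef past zero")
	case v == -1:
		destroy()
	}
}

func main() {
	var r inodeRefs // one reference
	r.IncRef()      // two references
	r.DecRef(func() { fmt.Println("destroy") }) // back to one; destroy not run
	r.DecRef(func() { fmt.Println("destroy") }) // zero: prints "destroy"
}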
func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { @@ -87,31 +87,39 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off Name: strconv.FormatUint(uint64(fd), 10), Type: typ, Ino: i.fs.NextIno(), - NextOff: offset + 1, + NextOff: int64(fd) + 3, } if err := cb.Handle(dirent); err != nil { - return offset, err + // Getdents should iterate correctly despite mutation + // of fds, so we return the next fd to serialize plus + // 2 (which accounts for the "." and ".." tracked by + // kernfs) as the offset. + return int64(fd) + 2, err } - offset++ } - return offset, nil + // We serialized them all. Next offset should be higher than last + // serialized fd. + return int64(fds[len(fds)-1]) + 3, nil } // fdDirInode represents the inode for /proc/[pid]/fd directory. // // +stateify savable type fdDirInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + fdDir + fdDirInodeRefs + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren - kernfs.AlwaysValid - fdDir } var _ kernfs.Inode = (*fdDirInode)(nil) -func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode { inode := &fdDirInode{ fdDir: fdDir{ fs: fs, @@ -120,16 +128,18 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) - - dentry := &kernfs.Dentry{} - dentry.Init(inode) + inode.EnableLeakCheck() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + return inode +} - return dentry +// IterDirents implements kernfs.inodeDirectory.IterDirents. +func (i *fdDirInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return i.fdDir.IterDirents(ctx, cb, offset, relOffset) } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -138,20 +148,21 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } - taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) - return taskDentry.VFSDentry(), nil + return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil } -// Open implements kernfs.Inode. -func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) +// Open implements kernfs.Inode.Open. +func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -// CheckPermissions implements kernfs.Inode. 
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 //
 // This is to match Linux, which uses a special permission handler to guarantee
 // that a process can still access /proc/self/fd after it has executed
@@ -173,10 +184,16 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
	return err
}

+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdDirInode) DecRef(ctx context.Context) {
+	i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
// fdSymlink is a symlink for the /proc/[pid]/fd/[fd] file.
//
// +stateify savable
type fdSymlink struct {
+	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeSymlink
@@ -187,19 +204,16 @@ type fdSymlink struct {

var _ kernfs.Inode = (*fdSymlink)(nil)

-func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode {
	inode := &fdSymlink{
		task: task,
		fd:   fd,
	}
	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	return inode
}

-func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
	file, _ := getTaskFD(s.task, s.fd)
	if file == nil {
		return "", syserror.ENOENT
@@ -221,21 +235,29 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
	return vd, "", nil
}

+// Valid implements kernfs.Inode.Valid.
+func (s *fdSymlink) Valid(ctx context.Context) bool {
+	return taskFDExists(ctx, s.task, s.fd)
+}
+
// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
//
// +stateify savable
type fdInfoDirInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	fdDir
+	fdInfoDirInodeRefs
+	implStatFS
+	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
}

var _ kernfs.Inode = (*fdInfoDirInode)(nil)

-func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode {
	inode := &fdInfoDirInode{
		fdDir: fdDir{
			fs:   fs,
@@ -243,16 +265,13 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
	}
	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.EnableLeakCheck()
	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-
-	return dentry
+	return inode
}

-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
	fdInt, err := strconv.ParseInt(name, 10, 32)
	if err != nil {
		return nil, syserror.ENOENT
@@ -265,25 +284,35 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
		task: i.task,
		fd:   fd,
	}
-	dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
-	return dentry.VFSDentry(), nil
+	return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil
}

-// Open implements kernfs.Inode.
-func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) +// IterDirents implements Inode.IterDirents. +func (i *fdInfoDirInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + return i.fdDir.IterDirents(ctx, cb, offset, relOffset) +} + +// Open implements kernfs.Inode.Open. +func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode.DecRef. +func (i *fdInfoDirInode) DecRef(ctx context.Context) { + i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile - refs.AtomicRefCount task *kernel.Task fd int32 @@ -305,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } + +// Valid implements kernfs.Inode.Valid. +func (d *fdInfoData) Valid(ctx context.Context) bool { + return taskFDExists(ctx, d.task, d.fd) +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 830b78949..79f8b7e9f 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -247,13 +247,10 @@ type commInode struct { task *kernel.Task } -func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { inode := &commInode{task: task} inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + return inode } func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { @@ -543,7 +540,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss, data uint64 s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + fds = fdTable.CurrentMaxFDs() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() @@ -648,6 +645,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset // // +stateify savable type exeSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -657,29 +655,30 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode { inode := &exeSymlink{task: task} inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + return inode } -// Readlink implements kernfs.Inode. 
-func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
-	if !kernel.ContextCanTrace(ctx, s.task, false) {
-		return "", syserror.EACCES
-	}
-
-	// Pull out the executable for /proc/[pid]/exe.
-	exec, err := s.executable()
+// Readlink implements kernfs.Inode.Readlink.
+func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+	exec, _, err := s.Getlink(ctx, nil)
	if err != nil {
		return "", err
	}
	defer exec.DecRef(ctx)

-	return exec.PathnameWithDeleted(ctx), nil
+	root := vfs.RootFromContext(ctx)
+	if !root.Ok() {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
+	return name, nil
}

// Getlink implements kernfs.Inode.Getlink.
@@ -687,23 +686,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
	if !kernel.ContextCanTrace(ctx, s.task, false) {
		return vfs.VirtualDentry{}, "", syserror.EACCES
	}
-
-	exec, err := s.executable()
-	if err != nil {
-		return vfs.VirtualDentry{}, "", err
-	}
-	defer exec.DecRef(ctx)
-
-	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
-	vd.IncRef()
-	return vd, "", nil
-}
-
-func (s *exeSymlink) executable() (file fsbridge.File, err error) {
	if err := checkTaskState(s.task); err != nil {
-		return nil, err
+		return vfs.VirtualDentry{}, "", err
	}
+
+	var err error
+	var exec fsbridge.File
	s.task.WithMuLocked(func(t *kernel.Task) {
		mm := t.MemoryManager()
		if mm == nil {
@@ -714,12 +702,75 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
		// The MemoryManager may be destroyed, in which case
		// MemoryManager.destroy will simply set the executable to nil
		// (with locks held).
-		file = mm.Executable()
-		if file == nil {
+		exec = mm.Executable()
+		if exec == nil {
			err = syserror.ESRCH
		}
	})
-	return
+	if err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	defer exec.DecRef(ctx)
+
+	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+// cwdSymlink is a symlink for the /proc/[pid]/cwd file.
+//
+// +stateify savable
+type cwdSymlink struct {
+	implStatFS
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+}
+
+var _ kernfs.Inode = (*cwdSymlink)(nil)
+
+func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
+	inode := &cwdSymlink{task: task}
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+	cwd, _, err := s.Getlink(ctx, nil)
+	if err != nil {
+		return "", err
+	}
+	defer cwd.DecRef(ctx)
+
+	root := vfs.RootFromContext(ctx)
+	if !root.Ok() {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
+	return name, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + cwd := s.task.FSContext().WorkingDirectoryVFS2() + if !cwd.Ok() { + // It could have raced with process deletion. + return vfs.VirtualDentry{}, "", syserror.ESRCH + } + return cwd, "", nil } // mountInfoData is used to implement /proc/[pid]/mountinfo. @@ -784,13 +835,14 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +// +stateify savable type namespaceSymlink struct { kernfs.StaticSymlink task *kernel.Task } -func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -801,37 +853,39 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d + return taskInode } -// Readlink implements Inode. -func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { +// Readlink implements kernfs.Inode.Readlink. +func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err } - return s.StaticSymlink.Readlink(ctx) + return s.StaticSymlink.Readlink(ctx, mnt) } -// Getlink implements Inode.Getlink. +// Getlink implements kernfs.Inode.Getlink. func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } // Create a synthetic inode to represent the namespace. + fs := mnt.Filesystem().Impl().(*filesystem) dentry := &kernfs.Dentry{} - dentry.Init(&namespaceInode{}) + dentry.Init(&fs.Filesystem, &namespaceInode{}) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) - vd.IncRef() - dentry.DecRef(ctx) + // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. + mnt.IncRef() return vd, "", nil } // namespaceInode is a synthetic inode created to represent a namespace in // /proc/[pid]/ns/*. +// +// +stateify savable type namespaceInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory @@ -850,12 +904,12 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32 i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } -// Open implements Inode.Open. -func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. 
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd := &namespaceFD{inode: i}
	i.IncRef()
	fd.LockFD.Init(&i.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
@@ -863,6 +917,8 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *

// namespaceFD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
@@ -873,20 +929,20 @@ type namespaceFD struct {

var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)

-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
	return fd.inode.Stat(ctx, vfs, opts)
}

-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
	creds := auth.CredentialsFromContext(ctx)
	return fd.inode.SetStat(ctx, vfs, creds, opts)
}

-// Release implements FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (fd *namespaceFD) Release(ctx context.Context) {
	fd.inode.DecRef(ctx)
}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index a4c884bf9..3425e8698 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -37,12 +37,12 @@ import (
	"gvisor.dev/gvisor/pkg/usermem"
)

-func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode {
	k := task.Kernel()
	pidns := task.PIDNamespace()
	root := auth.NewRootCredentials(pidns.UserNamespace())

-	var contents map[string]*kernfs.Dentry
+	var contents map[string]kernfs.Inode
	if stack := task.NetworkNamespace().Stack(); stack != nil {
		const (
			arp = "IP address       HW type     Flags       HW address            Mask     Device\n"
@@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {

		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
		// network namespace.
-		contents = map[string]*kernfs.Dentry{
-			"dev":  fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
-			"snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
+		contents = map[string]kernfs.Inode{
+			"dev":  fs.newInode(root, 0444, &netDevData{stack: stack}),
+			"snmp": fs.newInode(root, 0444, &netSnmpData{stack: stack}),

			// The following files are simple stubs until they are implemented in
			// netstack; if the file contains a header, the stub is just the header,
			// otherwise it is an empty file.
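The stub entries described in the comment above come from newStaticFile, which wraps fixed contents in the one-shot generator contract of vfs.DynamicBytesSource. A simplified sketch of that shape, with the context argument and kernfs plumbing elided; staticFile here is an illustrative stand-in, not the real type:

package main

import (
	"bytes"
	"fmt"
)

// staticFile sketches the generator behind newStaticFile: Generate
// always emits the same bytes, e.g. a header-only stub or "".
type staticFile struct {
	contents string
}

// Generate mirrors the vfs.DynamicBytesSource.Generate shape used
// throughout this change, minus the context argument.
func (s *staticFile) Generate(buf *bytes.Buffer) error {
	buf.WriteString(s.contents)
	return nil
}

func main() {
	var buf bytes.Buffer
	// The arp stub above is exactly this kind of header-only file.
	stub := &staticFile{contents: "IP address       HW type     Flags       HW address            Mask     Device\n"}
	if err := stub.Generate(&buf); err != nil {
		panic(err)
	}
	fmt.Print(buf.String())
}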
- "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), - "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), - "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), - "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), - "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), + "arp": fs.newInode(root, 0444, newStaticFile(arp)), + "netlink": fs.newInode(root, 0444, newStaticFile(netlink)), + "netstat": fs.newInode(root, 0444, &netStatData{}), + "packet": fs.newInode(root, 0444, newStaticFile(packet)), + "protocols": fs.newInode(root, 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), - "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), - "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), - "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), - "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), - "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), + "psched": fs.newInode(root, 0444, newStaticFile(psched)), + "ptype": fs.newInode(root, 0444, newStaticFile(ptype)), + "route": fs.newInode(root, 0444, &netRouteData{stack: stack}), + "tcp": fs.newInode(root, 0444, &netTCPData{kernel: k}), + "udp": fs.newInode(root, 0444, &netUDPData{kernel: k}), + "unix": fs.newInode(root, 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) - contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newInode(root, 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newInode(root, 0444, newStaticFile("")) + contents["tcp6"] = fs.newInode(root, 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newInode(root, 0444, newStaticFile(upd6)) } } @@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - s.Refs()-1, // RefCount, don't count our own ref. + s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. @@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. 
Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -616,6 +616,7 @@ type netSnmpData struct { var _ dynamicInode = (*netSnmpData)(nil) +// +stateify savable type snmpLine struct { prefix string header string @@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string { return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. } -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { types := []interface{}{ &inet.StatSNMPIP{}, @@ -709,7 +710,7 @@ type netRouteData struct { var _ dynamicInode = (*netRouteData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") @@ -773,7 +774,7 @@ type netStatData struct { var _ dynamicInode = (*netStatData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " + diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 6d2b90a8b..3259c3732 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,11 +37,14 @@ const ( // // +stateify savable type tasksInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren - kernfs.AlwaysValid + tasksInodeRefs locks vfs.FileLocks @@ -50,8 +53,6 @@ type tasksInode struct { // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. - selfSymlink *vfs.Dentry - threadSelfSymlink *vfs.Dentry // cgroupControllers is a map of controller name to directory in the // cgroup hierarchy. 
These controllers are immutable and will be listed
@@ -61,51 +62,53 @@ var _ kernfs.Inode = (*tasksInode)(nil)

-func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode {
	root := auth.NewRootCredentials(pidns.UserNamespace())
-	contents := map[string]*kernfs.Dentry{
-		"cpuinfo":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
-		"filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
-		"loadavg":     fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
+	contents := map[string]kernfs.Inode{
+		"cpuinfo":     fs.newInode(root, 0444, newStaticFileSetStat(cpuInfoData(k))),
+		"filesystems": fs.newInode(root, 0444, &filesystemsData{}),
+		"loadavg":     fs.newInode(root, 0444, &loadavgData{}),
		"sys":         fs.newSysDir(root, k),
-		"meminfo":     fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
+		"meminfo":     fs.newInode(root, 0444, &meminfoData{}),
		"mounts":      kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
		"net":         kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
-		"stat":        fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
-		"uptime":      fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
-		"version":     fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
+		"stat":        fs.newInode(root, 0444, &statData{}),
+		"uptime":      fs.newInode(root, 0444, &uptimeData{}),
+		"version":     fs.newInode(root, 0444, &versionData{}),
	}

	inode := &tasksInode{
		pidns:             pidns,
		fs:                fs,
-		selfSymlink:       fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
-		threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
		cgroupControllers: cgroupControllers,
	}
	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.EnableLeakCheck()

	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := inode.OrderedChildren.Populate(dentry, contents)
+	links := inode.OrderedChildren.Populate(contents)
	inode.IncLinks(links)

-	return inode, dentry
+	return inode
}

-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	// Try to lookup a corresponding task.
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	// Check if a static entry was looked up.
+	if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+		return d, nil
+	}
+
+	// Not a static entry. Try to look up a corresponding task.
	tid, err := strconv.ParseUint(name, 10, 64)
	if err != nil {
+		root := auth.NewRootCredentials(i.pidns.UserNamespace())
		// If it failed to parse, check if it's one of the specially handled files.
switch name { case selfName: - return i.selfSymlink, nil + return i.newSelfSymlink(root), nil case threadSelfName: - return i.threadSelfSymlink, nil + return i.newThreadSelfSymlink(root), nil } return nil, syserror.ENOENT } @@ -115,11 +118,10 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) - return taskDentry.VFSDentry(), nil + return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 const FIRST_PROCESS_ENTRY = 256 @@ -197,9 +199,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback return maxTaskID, nil } -// Open implements kernfs.Inode. -func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) +// Open implements kernfs.Inode.Open. +func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } @@ -224,9 +228,16 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } +// DecRef implements kernfs.Inode.DecRef. +func (i *tasksInode) DecRef(ctx context.Context) { + i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. +// +// +stateify savable type staticFileSetStat struct { dynamicBytesFileSetAttr vfs.StaticData diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 7d8983aa5..07c27cdd9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -31,7 +31,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type selfSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -41,16 +43,13 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { - inode := &selfSymlink{pidns: pidns} - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (i *tasksInode) newSelfSymlink(creds *auth.Credentials) kernfs.Inode { + inode := &selfSymlink{pidns: i.pidns} + inode.Init(creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) + return inode } -func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? 
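tasksInode.Lookup above resolves a /proc root name in three steps: static children win, non-numeric names fall through to the specially handled self and thread-self symlinks, and numeric names must map to a live task in the PID namespace. A standalone sketch of that resolution order, with strings standing in for kernfs.Inode and invented helper names:

package main

import (
	"errors"
	"fmt"
	"strconv"
)

var errNoEnt = errors.New("ENOENT")

// lookupProcRoot mirrors the order in tasksInode.Lookup: static
// entries first, then "self"/"thread-self" for non-numeric names,
// then live tasks by TID. tasks stands in for the PID namespace.
func lookupProcRoot(static map[string]string, tasks map[uint64]string, name string) (string, error) {
	if inode, ok := static[name]; ok {
		return inode, nil
	}
	tid, err := strconv.ParseUint(name, 10, 64)
	if err != nil {
		// Not a TID; check the specially handled names.
		switch name {
		case "self", "thread-self":
			return "symlink:" + name, nil
		}
		return "", errNoEnt
	}
	task, ok := tasks[tid]
	if !ok {
		return "", errNoEnt
	}
	return "taskdir:" + task, nil
}

func main() {
	static := map[string]string{"uptime": "file:uptime"}
	tasks := map[uint64]string{42: "task 42"}
	for _, name := range []string{"uptime", "self", "42", "99", "foo"} {
		inode, err := lookupProcRoot(static, tasks, name)
		fmt.Println(name, "->", inode, err)
	}
}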
@@ -63,17 +62,19 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } -func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// +stateify savable type threadSelfSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -83,16 +84,13 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { - inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (i *tasksInode) newThreadSelfSymlink(creds *auth.Credentials) kernfs.Inode { + inode := &threadSelfSymlink{pidns: i.pidns} + inode.Init(creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) + return inode } -func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -106,12 +104,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } -func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -119,16 +117,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden // dynamicBytesFileSetAttr implements a special file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. +// +// +stateify savable type dynamicBytesFileSetAttr struct { kernfs.DynamicBytesFile } -// SetStat implements Inode.SetStat. +// SetStat implements kernfs.Inode.SetStat. func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. +// +// +stateify savable type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. 
user uint64
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6435385ef..95420368d 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -27,9 +27,11 @@ import (
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/usermem"
)

+// +stateify savable
type tcpMemDir int

const (
@@ -38,92 +40,93 @@ const (
)

// newSysDir returns the inode corresponding to the /proc/sys directory.
-func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
-			"shmall":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
-			"shmmax":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
-			"shmmni":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
+func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+	return fs.newStaticDir(root, map[string]kernfs.Inode{
+		"kernel": fs.newStaticDir(root, map[string]kernfs.Inode{
+			"hostname": fs.newInode(root, 0444, &hostnameData{}),
+			"shmall":   fs.newInode(root, 0444, shmData(linux.SHMALL)),
+			"shmmax":   fs.newInode(root, 0444, shmData(linux.SHMMAX)),
+			"shmmni":   fs.newInode(root, 0444, shmData(linux.SHMMNI)),
		}),
-		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"mmap_min_addr":     fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
-			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"vm": fs.newStaticDir(root, map[string]kernfs.Inode{
+			"mmap_min_addr":     fs.newInode(root, 0444, &mmapMinAddrData{k: k}),
+			"overcommit_memory": fs.newInode(root, 0444, newStaticFile("0\n")),
		}),
		"net": fs.newSysNetDir(root, k),
	})
}

// newSysNetDir returns the inode corresponding to the /proc/sys/net directory.
-func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	var contents map[string]*kernfs.Dentry
+func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+	var contents map[string]kernfs.Inode

	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
	// network namespace of the calling process.
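newSysDir and newSysNetDir here assemble the whole /proc/sys subtree as nested name-to-inode maps handed to newStaticDir. A toy version of that construction, with a plain struct standing in for kernfs.Inode and invented helper names:

package main

import "fmt"

// node stands in for kernfs.Inode: either a leaf with static
// contents or a directory of children, mirroring the nested
// newStaticDir/newInode calls in newSysDir.
type node struct {
	contents string
	children map[string]*node
}

func file(contents string) *node          { return &node{contents: contents} }
func dir(children map[string]*node) *node { return &node{children: children} }

func main() {
	sys := dir(map[string]*node{
		"kernel": dir(map[string]*node{
			"hostname": file(""),
		}),
		"vm": dir(map[string]*node{
			"overcommit_memory": file("0\n"),
		}),
	})
	fmt.Print(sys.children["vm"].children["overcommit_memory"].contents)
}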
if stack := k.RootNetworkNamespace().Stack(); stack != nil {
-		contents = map[string]*kernfs.Dentry{
-			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
-				"tcp_rmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
-				"tcp_sack":     fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
-				"tcp_wmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+		contents = map[string]kernfs.Inode{
+			"ipv4": fs.newStaticDir(root, map[string]kernfs.Inode{
+				"tcp_recovery": fs.newInode(root, 0644, &tcpRecoveryData{stack: stack}),
+				"tcp_rmem":     fs.newInode(root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
+				"tcp_sack":     fs.newInode(root, 0644, &tcpSackData{stack: stack}),
+				"tcp_wmem":     fs.newInode(root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+				"ip_forward":   fs.newInode(root, 0444, &ipForwarding{stack: stack}),

				// The following files are simple stubs until they are implemented in
				// netstack; most of these files are configuration related. We use the
				// value closest to the actual netstack behavior or an empty file; all
				// of these files have mode 0444 (read-only for all users).
-				"ip_local_port_range":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000   65535")),
-				"ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"ipfrag_time":             fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
-				"ip_nonlocal_bind":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"ip_no_pmtu_disc":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"ip_local_port_range":     fs.newInode(root, 0444, newStaticFile("16000   65535")),
+				"ip_local_reserved_ports": fs.newInode(root, 0444, newStaticFile("")),
+				"ipfrag_time":             fs.newInode(root, 0444, newStaticFile("30")),
+				"ip_nonlocal_bind":        fs.newInode(root, 0444, newStaticFile("0")),
+				"ip_no_pmtu_disc":         fs.newInode(root, 0444, newStaticFile("1")),

				// tcp_allowed_congestion_control tells the user what they are able to
				// do as an unprivileged process so we leave it empty.
-				"tcp_allowed_congestion_control":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
-				"tcp_congestion_control":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_allowed_congestion_control":   fs.newInode(root, 0444, newStaticFile("")),
+				"tcp_available_congestion_control": fs.newInode(root, 0444, newStaticFile("reno")),
+				"tcp_congestion_control":           fs.newInode(root, 0444, newStaticFile("reno")),

				// Many of the following stub files are features netstack doesn't
				// support. The unsupported features return "0" to indicate they are
				// disabled.
- "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")), - "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")), - "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), - "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")), - "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), - "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), - "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_base_mss": fs.newInode(root, 0444, newStaticFile("1280")), + "tcp_dsack": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_early_retrans": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_fack": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_fastopen": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_fastopen_key": fs.newInode(root, 0444, newStaticFile("")), + "tcp_invalid_ratelimit": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_keepalive_intvl": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_keepalive_probes": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_keepalive_time": fs.newInode(root, 0444, newStaticFile("7200")), + "tcp_mtu_probing": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_no_metrics_save": fs.newInode(root, 0444, newStaticFile("1")), + "tcp_probe_interval": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_probe_threshold": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_retries1": fs.newInode(root, 0444, newStaticFile("3")), + "tcp_retries2": fs.newInode(root, 0444, newStaticFile("15")), + "tcp_rfc1337": fs.newInode(root, 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": fs.newInode(root, 0444, newStaticFile("1")), + "tcp_synack_retries": fs.newInode(root, 0444, newStaticFile("5")), + "tcp_syn_retries": fs.newInode(root, 0444, newStaticFile("3")), + "tcp_timestamps": fs.newInode(root, 0444, newStaticFile("1")), }), - "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), - "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), - "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), - "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, 
newStaticFile("0")), - "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")), - "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "core": fs.newStaticDir(root, map[string]kernfs.Inode{ + "default_qdisc": fs.newInode(root, 0444, newStaticFile("pfifo_fast")), + "message_burst": fs.newInode(root, 0444, newStaticFile("10")), + "message_cost": fs.newInode(root, 0444, newStaticFile("5")), + "optmem_max": fs.newInode(root, 0444, newStaticFile("0")), + "rmem_default": fs.newInode(root, 0444, newStaticFile("212992")), + "rmem_max": fs.newInode(root, 0444, newStaticFile("212992")), + "somaxconn": fs.newInode(root, 0444, newStaticFile("128")), + "wmem_default": fs.newInode(root, 0444, newStaticFile("212992")), + "wmem_max": fs.newInode(root, 0444, newStaticFile("212992")), }), } } - return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) + return fs.newStaticDir(root, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for @@ -174,7 +177,7 @@ type tcpSackData struct { var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.enabled == nil { sack, err := d.stack.TCPSACKEnabled() @@ -232,7 +235,7 @@ type tcpRecoveryData struct { var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { recovery, err := d.stack.TCPRecovery() if err != nil { @@ -284,7 +287,7 @@ type tcpMemData struct { var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.mu.Lock() defer d.mu.Unlock() @@ -354,3 +357,63 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) } } + +// ipForwarding implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/ip_forwarding. +// +// +stateify savable +type ipForwarding struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` + enabled *bool +} + +var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { + if ipf.enabled == nil { + enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber) + ipf.enabled = &enabled + } + + val := "0\n" + if *ipf.enabled { + // Technically, this is not quite compatible with Linux. Linux stores these + // as an integer, so if you write "2" into tcp_sack, you should get 2 back. + // Tough luck. + val = "1\n" + } + buf.WriteString(val) + + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. 
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+	if ipf.enabled == nil {
+		ipf.enabled = new(bool)
+	}
+	*ipf.enabled = v != 0
+	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index be54897bb..6cee22823 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,8 +20,10 @@ import (
	"testing"

	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/contexttest"
	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/usermem"
)

func newIPv6TestStack() *inet.TestStack {
@@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) {
		t.Errorf("Got n.contents() = %v, want = %v", got, want)
	}
}
+
+// TestConfigureIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forward.
+func TestConfigureIPForwarding(t *testing.T) {
+	ctx := context.Background()
+	s := inet.NewTestStack()
+
+	var cases = []struct {
+		comment string
+		initial bool
+		str     string
+		final   bool
+	}{
+		{
+			comment: `Forwarding is disabled; write 1 and enable forwarding`,
+			initial: false,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is disabled; write 0 and disable forwarding`,
+			initial: false,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is enabled; write 1 and enable forwarding`,
+			initial: true,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 0 and disable forwarding`,
+			initial: true,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+			initial: false,
+			str:     "2404",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+			initial: true,
+			str:     "2404",
+			final:   true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.comment, func(t *testing.T) {
+			s.IPForwarding = c.initial
+
+			file := &ipForwarding{stack: s, enabled: &c.initial}
+
+			// Write the values.
+			src := usermem.BytesIOSequence([]byte(c.str))
+			if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
+				t.Errorf("file.Write(ctx, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+			}
+
+			// Read the values from the stack and check them.
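The Write implementation above accepts any ASCII integer and treats a non-zero value as enabled, which is exactly what the 2404 cases in the test exercise. The parsing step, sketched with strconv standing in for usermem.CopyInt32StringInVec (parseSysctlBool is an invented helper name):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseSysctlBool mirrors the Write logic above: parse an ASCII
// int32 and treat any non-zero value as "enabled", so "2404"
// enables forwarding just like "1".
func parseSysctlBool(s string) (bool, error) {
	v, err := strconv.ParseInt(strings.TrimSpace(s), 10, 32)
	if err != nil {
		return false, err
	}
	return v != 0, nil
}

func main() {
	for _, in := range []string{"0", "1", "2404"} {
		enabled, err := parseSysctlBool(in)
		fmt.Printf("%q -> %v (err=%v)\n", in, enabled, err)
	}
}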
+ if got, want := s.IPForwarding, c.final; got != want { + t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 3c9297dee..2582ababd 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -67,6 +67,7 @@ var ( taskStaticFiles = map[string]testutil.DirentType{ "auxv": linux.DT_REG, "cgroup": linux.DT_REG, + "cwd": linux.DT_LNK, "cmdline": linux.DT_REG, "comm": linux.DT_REG, "environ": linux.DT_REG, @@ -104,13 +105,16 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } + root := mntns.Root() + root.IncRef() + defer root.DecRef(ctx) pop := &vfs.PathOperation{ - Root: mntns.Root(), - Start: mntns.Root(), + Root: root, + Start: root, Path: fspath.Parse("/proc"), } if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { @@ -118,8 +122,8 @@ func setup(t *testing.T) *testutil.System { } pop = &vfs.PathOperation{ - Root: mntns.Root(), - Start: mntns.Root(), + Root: root, + Start: root, Path: fspath.Parse("/proc"), } mntOpts := &vfs.MountOptions{ @@ -132,7 +136,7 @@ func setup(t *testing.T) *testutil.System { }, }, } - if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { t.Fatalf("MountAt(/proc): %v", err) } return testutil.NewSystem(ctx, t, k.VFS(), mntns) diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD index 067c1657f..adb610213 100644 --- a/pkg/sentry/fsimpl/signalfd/BUILD +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/kernel", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 6297e1df4..10f1452ef 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -16,7 +16,6 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -26,7 +25,9 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// SignalFileDescription implements FileDescriptionImpl for signal fds. +// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds. +// +// +stateify savable type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -43,7 +44,7 @@ type SignalFileDescription struct { target *kernel.Task // mu protects mask. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // mask is the signal mask. Protected by mu. mask linux.SignalSet @@ -83,7 +84,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { sfd.mask = mask } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { // Attempt to dequeue relevant signals. 
info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) @@ -93,8 +94,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen } // Copy out the signal info using the specified format. - var buf [128]byte - binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, @@ -103,9 +103,13 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), - }) - n, err := dst.CopyOut(ctx, buf[:]) - return int64(n), err + } + n, err := infoNative.WriteTo(dst.Writer(ctx)) + if err == usermem.ErrEndOfIOSequence { + // Partial copy-out ok. + err = nil + } + return n, err } // Readiness implements waiter.Waitable.Readiness. @@ -132,5 +136,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { sfd.target.SignalUnregister(entry) } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index c61818ff6..cf91ea36c 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -28,14 +28,16 @@ import ( ) // filesystemType implements vfs.FilesystemType. +// +// +stateify savable type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("sockfs.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. // // Note that registering sockfs is unnecessary, except for the fact that it // will not show up under /proc/filesystems as a result. This is a very minor @@ -44,6 +46,10 @@ func (filesystemType) Name() string { return "sockfs" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -80,18 +86,25 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink kernfs.InodeAttrs kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink } // Open implements kernfs.Inode.Open. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ENXIO } +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil +} + // NewDentry constructs and returns a sockfs dentry. // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). 
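//
// Note the two-argument form of kernfs.Dentry.Init in the hunk below, which
// recurs throughout this change: the owning kernfs.Filesystem is now passed
// alongside the inode, e.g.:
//
//	var d kernfs.Dentry
//	d.Init(&fs.Filesystem, inode) // fs and inode as in the surrounding code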
@@ -104,6 +117,6 @@ func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} - d.Init(i) + d.Init(&fs.Filesystem, i) return d.VFSDentry() } diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 1b548ccd4..906cd52cb 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -1,21 +1,41 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "sys", + prefix = "dir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "dir", + }, +) + go_library( name = "sys", srcs = [ + "dir_refs.go", + "kcov.go", "sys.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/coverage", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go new file mode 100644 index 000000000..31a361029 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -0,0 +1,118 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sys + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + k := &kcovInode{} + k.InodeAttrs.Init(creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600) + return k +} + +// kcovInode implements kernfs.Inode. +// +// +stateify savable +type kcovInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + implStatFS +} + +func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + panic("KernelFromContext returned nil") + } + fd := &kcovFD{ + inode: i, + kcov: k.NewKcov(), + } + + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// +stateify savable +type kcovFD struct { + vfs.FileDescriptionDefaultImpl + vfs.NoLockFD + + vfsfd vfs.FileDescription + inode *kcovInode + kcov *kernel.Kcov +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. 
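+//
+// The guest-visible protocol mirrors Linux's kcov. A rough usage sketch, in
+// which coverSize (a count of 8-byte words) and the error handling are
+// illustrative only:
+//
+//	fd, _ := unix.Open("/sys/kernel/debug/kcov", unix.O_RDWR, 0)
+//	unix.Syscall(unix.SYS_IOCTL, uintptr(fd), uintptr(linux.KCOV_INIT_TRACE), coverSize)
+//	cover, _ := unix.Mmap(fd, 0, 8*int(coverSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
+//	unix.Syscall(unix.SYS_IOCTL, uintptr(fd), uintptr(linux.KCOV_ENABLE), uintptr(linux.KCOV_TRACE_PC))
+//	// ... run code; the first 8 bytes of cover hold the PC count, PCs follow ...
+//	unix.Syscall(unix.SYS_IOCTL, uintptr(fd), uintptr(linux.KCOV_DISABLE), 0)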
+func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	cmd := uint32(args[1].Int())
+	arg := args[2].Uint64()
+	switch cmd {
+	case linux.KCOV_INIT_TRACE:
+		return 0, fd.kcov.InitTrace(arg)
+	case linux.KCOV_ENABLE:
+		return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
+	case linux.KCOV_DISABLE:
+		if arg != 0 {
+			// This arg is unused; it should be 0.
+			return 0, syserror.EINVAL
+		}
+		return 0, fd.kcov.DisableTrace(ctx)
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.kcov.ConfigureMMap(ctx, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *kcovFD) Release(ctx context.Context) {
+	// kcov instances have reference counts in Linux, but this seems sufficient
+	// for our purposes.
+	fd.kcov.Clear(ctx)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts)
+}
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0401726b6..1ad679830 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -21,6 +21,7 @@ import (

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -33,9 +34,13 @@ const Name = "sysfs"
const defaultSysDirMode = linux.FileMode(0755)

// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
	kernfs.Filesystem

@@ -47,6 +52,9 @@ func (FilesystemType) Name() string {
	return Name
}

+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -59,31 +67,33 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + root := fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ "block": fs.newDir(creds, defaultSysDirMode, nil), "bus": fs.newDir(creds, defaultSysDirMode, nil), - "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "class": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ "power_supply": fs.newDir(creds, defaultSysDirMode, nil), }), "dev": fs.newDir(creds, defaultSysDirMode, nil), - "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "devices": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ + "system": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ "cpu": cpuDir(ctx, fs, creds), }), }), "firmware": fs.newDir(creds, defaultSysDirMode, nil), "fs": fs.newDir(creds, defaultSysDirMode, nil), - "kernel": fs.newDir(creds, defaultSysDirMode, nil), + "kernel": kernelDir(ctx, fs, creds), "module": fs.newDir(creds, defaultSysDirMode, nil), "power": fs.newDir(creds, defaultSysDirMode, nil), }) - return fs.VFSFilesystem(), root.VFSDentry(), nil + var rootD kernfs.Dentry + rootD.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), rootD.VFSDentry(), nil } -func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry { +func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() - children := map[string]*kernfs.Dentry{ + children := map[string]kernfs.Inode{ "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), @@ -94,6 +104,21 @@ func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernf return fs.newDir(creds, defaultSysDirMode, children) } +func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { + // If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs + // should be mounted at debug/, but for our purposes, it is sufficient to + // keep it in sys. + var children map[string]kernfs.Inode + if coverage.KcovAvailable() { + children = map[string]kernfs.Inode{ + "debug": fs.newDir(creds, linux.FileMode(0700), map[string]kernfs.Inode{ + "kcov": fs.newKcovFile(ctx, creds), + }), + } + } + return fs.newDir(creds, defaultSysDirMode, children) +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -101,46 +126,62 @@ func (fs *filesystem) Release(ctx context.Context) { } // dir implements kernfs.Inode. 
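+//
+// dir embeds dirRefs, the reference-count type generated from refs_template by
+// the go_template_instance rule in this package's BUILD file. A rough sketch
+// of the generated API as used below:
+//
+//	d.EnableLeakCheck()                         // arm leak detection on the count
+//	d.IncRef()                                  // take a reference
+//	d.dirRefs.DecRef(func() { d.Destroy(ctx) }) // drop one; destroy on zero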
+//
+// +stateify savable
type dir struct {
+	dirRefs
+	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
-	kernfs.InodeNoDynamicLookup
	kernfs.InodeNotSymlink
	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeTemporary
	kernfs.OrderedChildren

	locks vfs.FileLocks
-
-	dentry kernfs.Dentry
}

-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
	d := &dir{}
	d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	d.dentry.Init(d)
-
-	d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
-
-	return &d.dentry
+	d.EnableLeakCheck()
+	d.IncLinks(d.OrderedChildren.Populate(contents))
+	return d
}

-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat, not allowing inode attributes to be changed.
func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return syserror.EPERM
}

// Open implements kernfs.Inode.Open.
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
	if err != nil {
		return nil, err
	}
	return fd.VFSFileDescription(), nil
}

+// DecRef implements kernfs.Inode.DecRef.
+func (d *dir) DecRef(ctx context.Context) {
+	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
+
// cpuFile implements kernfs.Inode.
+//
+// +stateify savable
type cpuFile struct {
+	implStatFS
	kernfs.DynamicBytesFile
+
	maxCores uint
}

@@ -150,10 +191,16 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
	return nil
}

-func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
	c := &cpuFile{maxCores: maxCores}
	c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
-	d := &kernfs.Dentry{}
-	d.Init(c)
-	return d
+	return c
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 9fd38b295..0a0d914cc 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System { AllowUserMount: true, }) - mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 1813269e0..738c0c9cc 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), FDTable: k.NewFDTable(), } - return k.TaskSet().NewTask(config) + t, err := k.TaskSet().NewTask(ctx, config) + if err != nil { + config.ThreadGroup.Release(ctx) + return nil, err + } + return t, nil } func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 568132121..1a8525b06 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -46,16 +46,18 @@ type System struct { // NewSystem constructs a System. // -// Precondition: Caller must hold a reference on MntNs, whose ownership +// Precondition: Caller must hold a reference on mns, whose ownership // is transferred to the new System. func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { + root := mns.Root() + root.IncRef() s := &System{ t: t, Ctx: ctx, Creds: auth.CredentialsFromContext(ctx), VFS: v, MntNs: mns, - Root: mns.Root(), + Root: root, } return s } @@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error { defer d.mu.Unlock() dirent, ok := d.dirents[name] if !ok { - return fmt.Errorf("No dirent named %q found", name) + return fmt.Errorf("no dirent named %q found", name) } if dirent.Type != typ { - return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) + return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) } return nil } diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 86beaa0a8..8853c8ad2 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -26,8 +26,10 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also // implements ktime.TimerListener. +// +// +stateify savable type TimerFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -62,7 +64,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, return &tfd.vfsfd, nil } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. 
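+//
+// Reads transfer exactly one uint64: the number of timer expirations since
+// the last read, in native byte order. A guest-side sketch (assuming a
+// little-endian machine and a timerFD obtained via timerfd_create(2)):
+//
+//	var buf [8]byte
+//	unix.Read(timerFD, buf[:])
+//	expirations := binary.LittleEndian.Uint64(buf[:])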
func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { @@ -128,7 +130,7 @@ func (tfd *TimerFileDescription) ResumeTimer() { tfd.timer.Resume() } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index d263147c2..3cc63e732 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -193,6 +193,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create nested directories with given depth. root := mntns.Root() + root.IncRef() defer root.DecRef(ctx) vd := root vd.IncRef() @@ -376,7 +377,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -387,6 +388,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create the mount point. root := mntns.Root() + root.IncRef() defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, @@ -405,7 +407,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { } defer mountPoint.DecRef(ctx) // Create and mount the submount. - if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index ac54d420d..9129d35b7 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) +// +stateify savable type deviceFile struct { inode inode kind vfs.DeviceKind diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 78b4fc5be..e90669cf0 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// +stateify savable type directory struct { // Since directories can't be hard-linked, each directory can only be // associated with a single dentry, which we can store in the directory @@ -44,7 +45,7 @@ type directory struct { // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() // efficiently. childList is protected by iterMu. 
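	//
	// (iterMu, like the other mutexes annotated in this change, is tagged
	// `state:"nosave"` below: locks may not be held across save/restore, so
	// the zero value is the correct restored state.)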
- iterMu sync.Mutex + iterMu sync.Mutex `state:"nosave"` childList dentryList } @@ -57,8 +58,9 @@ func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.Fi return dir } -// Preconditions: filesystem.mu must be locked for writing. dir must not -// already contain a child with the given name. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * dir must not already contain a child with the given name. func (dir *directory) insertChildLocked(child *dentry, name string) { child.parent = &dir.dentry child.name = name @@ -85,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index b0ec177e6..e39cd305b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -39,7 +38,9 @@ func (fs *filesystem) Sync(ctx context.Context) error { // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). +// Preconditions: +// * filesystem.mu must be locked. +// * !rp.Done(). func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { @@ -97,7 +98,9 @@ afterSymlink: // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). +// Preconditions: +// * filesystem.mu must be locked. +// * !rp.Done(). func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, err := stepLocked(ctx, rp, d) @@ -139,8 +142,9 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) // doCreateAt is loosely analogous to a conjunction of Linux's // fs/namei.c:filename_create() and done_path_create(). // -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). 
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() @@ -669,11 +673,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() return err } - if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil { - fs.mu.RUnlock() + err = d.inode.setStat(ctx, rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) @@ -701,16 +705,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } - statfs := linux.Statfs{ - Type: linux.TMPFS_MAGIC, - BlockSize: usermem.PageSize, - FragmentSize: usermem.PageSize, - NameLength: linux.NAME_MAX, - // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. - Blocks: 0, - BlocksFree: 0, - } - return statfs, nil + return globalStatfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -775,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() @@ -788,65 +783,68 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } switch impl := d.inode.impl.(type) { case *socketFile: + if impl.ep == nil { + return nil, syserror.ECONNREFUSED + } return impl.ep, nil default: return nil, syserror.ECONNREFUSED } } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } - return d.inode.listxattr(size) + return d.inode.listXattr(size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return "", err } - return d.inode.getxattr(rp.Credentials(), &opts) + return d.inode.getXattr(rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 
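+//
+// The unlock pattern below is shared with SetStatAt and RemoveXattrAt in this
+// change: fs.mu is released exactly once, before the error check, rather than
+// separately on each return path.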
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } - if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { - fs.mu.RUnlock() + err = d.inode.setXattr(rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } - if err := d.inode.removexattr(rp.Credentials(), name); err != nil { - fs.mu.RUnlock() + err = d.inode.removeXattr(rp.Credentials(), name) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil @@ -867,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } if d.parent == nil { if d.name != "" { - // This must be an anonymous memfd file. + // This file must have been created by + // newUnlinkedRegularFileDescription(). In Linux, + // mm/shmem.c:__shmem_file_setup() => + // fs/file_table.c:alloc_file_pseudo() sets the created + // dentry's dentry_operations to anon_ops, for which d_dname == + // simple_dname. fs/d_path.c:simple_dname() defines the + // dentry's pathname to be its name, prefixed with "/" and + // suffixed with " (deleted)". b.PrependComponent("/" + d.name) + b.AppendString(" (deleted)") return vfs.PrependPathSyntheticError{} } return vfs.PrependPathAtNonMountRootError{} diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 739350cf0..d772db9e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type namedPipe struct { inode inode @@ -28,8 +29,8 @@ type namedPipe struct { } // Preconditions: -// * fs.mu must be locked. -// * rp.Mount().CheckBeginWrite() has been called successfully. +// * fs.mu must be locked. +// * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode) diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index ec2701d8b..2f856ce36 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -158,13 +158,14 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } // Create the pipe. 
root := mntns.Root() + root.IncRef() pop := vfs.PathOperation{ Root: root, Start: root, diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 0710b65db..ce4e3eda7 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -36,12 +36,18 @@ import ( ) // regularFile is a regular (=S_IFREG) tmpfs file. +// +// +stateify savable type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile + // memoryUsageKind is the memory accounting category under which pages backing + // this regularFile's contents are accounted. + memoryUsageKind usage.MemoryKind + // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` @@ -62,7 +68,7 @@ type regularFile struct { writableMappingPages uint64 // dataMu protects the fields below. - dataMu sync.RWMutex + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. @@ -86,14 +92,75 @@ type regularFile struct { func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := ®ularFile{ - memFile: fs.memFile, - seals: linux.F_SEAL_SEAL, + memFile: fs.memFile, + memoryUsageKind: usage.Tmpfs, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } +// newUnlinkedRegularFileDescription creates a regular file on the tmpfs +// filesystem represented by mount and returns an FD representing that file. +// The new file is not reachable by path traversal from any other file. +// +// newUnlinkedRegularFileDescription is analogous to Linux's +// mm/shmem.c:__shmem_file_setup(). +// +// Preconditions: mount must be a tmpfs mount. +func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") + } + + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) + d := fs.newDentry(inode) + defer d.DecRef(ctx) + d.name = name + + fd := ®ularFileFD{} + fd.Init(&inode.locks) + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// NewZeroFile creates a new regular file and file description as for +// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is +// initially (implicitly) filled with zeroes. +// +// Preconditions: mount must be a tmpfs mount. +func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { + // Compare mm/shmem.c:shmem_zero_setup(). + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") + if err != nil { + return nil, err + } + rf := fd.inode().impl.(*regularFile) + rf.memoryUsageKind = usage.Anonymous + rf.size = size + return &fd.vfsfd, err +} + +// NewMemfd creates a new regular file and file description as for +// memfd_create. +// +// Preconditions: mount must be a tmpfs mount. 
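+//
+// The name is cosmetic: as described above PrependPath in filesystem.go, it
+// reads back from /proc/[pid]/fd as "/name (deleted)". A sketch of the
+// intended call (tmpfsMount stands in for whatever tmpfs mount backs memfds):
+//
+//	fd, err := tmpfs.NewMemfd(ctx, creds, tmpfsMount, true /* allowSeals */, "memfd:demo")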
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) + if err != nil { + return nil, err + } + if allowSeals { + fd.inode().impl.(*regularFile).seals = 0 + } + return &fd.vfsfd, nil +} + // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { @@ -226,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap. optional.End = pgend } - cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) @@ -260,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error { return nil } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 - offMu sync.Mutex + offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. @@ -575,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind) if err != nil { retErr = err goto exitLoop diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go index 3ed650474..5699d5975 100644 --- a/pkg/sentry/fsimpl/tmpfs/socket_file.go +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -21,6 +21,8 @@ import ( ) // socketFile is a socket (=S_IFSOCK) tmpfs file. +// +// +stateify savable type socketFile struct { inode inode ep transport.BoundEndpoint diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index b0de5fabe..a102a2ee2 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) +// +stateify savable type symlink struct { inode inode target string // immutable diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index de2af6d01..e2a0aac69 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -51,9 +51,13 @@ import ( const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -67,9 +71,11 @@ type filesystem struct { devMinor uint32 // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` nextInoMinusOne uint64 // accessed using atomic memory operations + + root *dentry } // Name implements vfs.FilesystemType.Name. @@ -77,7 +83,12 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. 
+func (FilesystemType) Release(ctx context.Context) {}
+
// FilesystemOpts is used to pass configuration data to tmpfs.
+//
+// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
@@ -188,6 +199,7 @@
		fs.vfsfs.DecRef(ctx)
		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
	}
+	fs.root = root
	return &fs.vfsfs, &root.vfsd, nil
}

@@ -199,9 +211,61 @@
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.mu.Lock()
+	if fs.root.inode.isDir() {
+		fs.root.releaseChildrenLocked(ctx)
+	}
+	fs.mu.Unlock()
+}
+
+// releaseChildrenLocked is called on the mount point by filesystem.Release() to
+// destroy all objects in the mount. It performs a depth-first walk of the
+// filesystem and "unlinks" everything by decrementing link counts
+// appropriately. There should be no open file descriptors when this is called,
+// so each inode should only have one outstanding reference that is removed once
+// its link count hits zero.
+//
+// Note that we do not update filesystem state precisely while tearing down (for
+// instance, the child maps are ignored)--we only care to remove all remaining
+// references so that every filesystem object gets destroyed. Also note that we
+// do not need to trigger DecRef on the mount point itself or any child mount;
+// these are taken care of by the destructor of the enclosing MountNamespace.
+//
+// Precondition: filesystem.mu is held.
+func (d *dentry) releaseChildrenLocked(ctx context.Context) {
+	dir := d.inode.impl.(*directory)
+	for _, child := range dir.childMap {
+		if child.inode.isDir() {
+			child.releaseChildrenLocked(ctx)
+			child.inode.decLinksLocked(ctx) // link for child/.
+			dir.inode.decLinksLocked(ctx)   // link for child/..
+		}
+		child.inode.decLinksLocked(ctx) // link for child
+	}
+}
+
+// immutable
+var globalStatfs = linux.Statfs{
+	Type:         linux.TMPFS_MAGIC,
+	BlockSize:    usermem.PageSize,
+	FragmentSize: usermem.PageSize,
+	NameLength:   linux.NAME_MAX,
+
+	// tmpfs currently does not support configurable size limits. In Linux,
+	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+	// statfs(2). However, many applications treat this as having a size limit
+	// of 0. To work around this, claim to have a very large but non-zero size,
+	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+	// applications may also handle incorrectly).
+	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
+	Blocks:          math.MaxInt64 / usermem.PageSize,
+	BlocksFree:      math.MaxInt64 / usermem.PageSize,
+	BlocksAvailable: math.MaxInt64 / usermem.PageSize,
}

// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
	vfsd vfs.Dentry

@@ -281,6 +345,8 @@ func (d *dentry) Watches() *vfs.Watches {
func (d *dentry) OnZeroWatches(context.Context) {}

// inode represents a filesystem object.
+//
+// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem
@@ -297,12 +363,12 @@ type inode struct {
	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
- mu sync.Mutex - mode uint32 // file type and mode - nlink uint32 // protected by filesystem.mu instead of inode.mu - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - ino uint64 // immutable + mu sync.Mutex `state:"nosave"` + mode uint32 // file type and mode + nlink uint32 // protected by filesystem.mu instead of inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime int64 // nanoseconds @@ -340,8 +406,10 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth // incLinksLocked increments i's link count. // -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -// i.nlink < maxLinks. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * i.nlink != 0. +// * i.nlink < maxLinks. func (i *inode) incLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.incLinksLocked() called with no existing links") @@ -355,7 +423,9 @@ func (i *inode) incLinksLocked() { // decLinksLocked decrements i's link count. If the link count reaches 0, we // remove a reference on i as well. // -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * i.nlink != 0. func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") @@ -594,66 +664,59 @@ func (i *inode) touchCMtime() { i.mu.Unlock() } -// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds -// inode.mu. +// Preconditions: +// * The caller has called vfs.Mount.CheckBeginWrite(). +// * inode.mu must be locked. 
func (i *inode) touchCMtimeLocked() { now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } -func (i *inode) listxattr(size uint64) ([]string, error) { - return i.xattrs.Listxattr(size) +func (i *inode) listXattr(size uint64) ([]string, error) { + return i.xattrs.ListXattr(size) } -func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { - if err := i.checkPermissions(creds, vfs.MayRead); err != nil { +func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { + if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP - } - if !i.userXattrSupported() { - return "", syserror.ENODATA - } - return i.xattrs.Getxattr(opts) + return i.xattrs.GetXattr(opts) } -func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { +func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { + if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !i.userXattrSupported() { - return syserror.EPERM - } - return i.xattrs.Setxattr(opts) + return i.xattrs.SetXattr(opts) } -func (i *inode) removexattr(creds *auth.Credentials, name string) error { - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { +func (i *inode) removeXattr(creds *auth.Credentials, name string) error { + if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return i.xattrs.RemoveXattr(name) +} + +func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + // We currently only support extended attributes in the user.* and + // trusted.* namespaces. See b/148380782. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return syserror.EOPNOTSUPP } - if !i.userXattrSupported() { - return syserror.EPERM + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + kuid := auth.KUID(atomic.LoadUint32(&i.uid)) + kgid := auth.KGID(atomic.LoadUint32(&i.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err } - return i.xattrs.Removexattr(name) -} - -// Extended attributes in the user.* namespace are only supported for regular -// files and directories. -func (i *inode) userXattrSupported() bool { - filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) - return filetype == linux.S_IFREG || filetype == linux.S_IFDIR + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -693,20 +756,25 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return nil } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.inode().listxattr(size) +// StatFS implements vfs.FileDescriptionImpl.StatFS. 
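+//
+// This returns the same globalStatfs served by StatFSAt in filesystem.go, so
+// statfs(2) on a path and fstatfs(2) on an open FD report identical values.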
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + return globalStatfs, nil } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listXattr(size) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) +} + +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() - if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil { return err } @@ -715,10 +783,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption return nil } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() - if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil { return err } @@ -727,37 +795,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return nil } -// NewMemfd creates a new tmpfs regular file and file description that can back -// an anonymous fd created by memfd_create. -func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { - fs, ok := mount.Filesystem().Impl().(*filesystem) - if !ok { - panic("NewMemfd() called with non-tmpfs mount") - } - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with - // S_IRWXUGO. - inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) - rf := inode.impl.(*regularFile) - if allowSeals { - rf.seals = 0 - } - - d := fs.newDentry(inode) - defer d.DecRef(ctx) - d.name = name - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with - // FMODE_READ | FMODE_WRITE. - var fd regularFileFD - fd.Init(&inode.locks) - flags := uint32(linux.O_RDWR) - if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil -} - // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index 6f3e3ae6f..fc5323abc 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -41,11 +41,12 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + root.IncRef() return vfsObj, root, func() { root.DecRef(ctx) mntns.DecRef(ctx) diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD index 28d2a4bcb..0ca750281 100644 --- a/pkg/sentry/fsimpl/verity/BUILD +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") licenses(["notice"]) @@ -13,11 +13,35 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/marshal/primitive", + "//pkg/merkletree", + "//pkg/sentry/arch", "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "verity_test", + srcs = [ + "verity_test.go", + ], + library = ":verity", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 78c6074bd..03da505e1 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -15,9 +15,17 @@ package verity import ( + "bytes" + "fmt" + "io" + "strconv" + "strings" + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/merkletree" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -91,10 +99,474 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de putDentrySlice(*ds) } -// resolveLocked resolves rp to an existing file. -func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { - // TODO(b/159261227): Implement resolveLocked. - return nil, nil +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may have a reference count of zero, and which therefore +// should be dropped once traversal is complete, are appended to ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). 
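+//
+// Like its tmpfs counterpart earlier in this change, stepLocked is loosely
+// analogous to Linux's fs/namei.c:walk_component().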
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + child, err := fs.getChildLocked(ctx, d, name, ds) + if err != nil { + return nil, err + } + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { + return nil, err + } + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// verifyChild verifies the hash of child against the already verified hash of +// the parent to ensure the child is expected. verifyChild triggers a sentry +// panic if unexpected modifications to the file system are detected. In +// noCrashOnVerificationFailure mode it returns a syserror instead. +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// TODO(b/166474175): Investigate all possible errors returned in this +// function, and make sure we differentiate all errors that indicate unexpected +// modifications to the file system from the ones that are not harmful. +func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { + vfsObj := fs.vfsfs.VirtualFilesystem() + + // Get the path to the child dentry. This is only used to provide path + // information in failure case. + childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD) + if err != nil { + return nil, err + } + + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() + // Read the offset of the child from the extended attributes of the + // corresponding Merkle tree file. + // This is the offset of the hash for child in its parent's Merkle tree + // file. + off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.lowerMerkleVD, + Start: child.lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: merkleOffsetInParentXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the file or the xattr does not + // exist, it indicates unexpected modifications to the file system. + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err)) + } + if err != nil { + return nil, err + } + // The offset xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + offset, err := strconv.Atoi(off) + if err != nil { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err)) + } + + // Open parent Merkle tree file to read and verify child's hash. 
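+	// The parent's Merkle tree file holds one digest per child; the offset
+	// read above locates this child's digest, and the Verify call below
+	// checks merkletree.DigestSize() bytes at that offset (ReadOffset and
+	// ReadSize in VerifyParams).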
+	parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerMerkleVD,
+		Start: parent.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+
+	// The parent Merkle tree file should have been created. If it's
+	// missing, it indicates an unexpected modification to the file system.
+	if err == syserror.ENOENT {
+		return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// dataSize is the size of raw data for the Merkle tree. For a file,
+	// dataSize is the size of the whole file. For a directory, dataSize is
+	// the size of all its children's hashes.
+	dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// The dataSize xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	parentSize, err := strconv.Atoi(dataSize)
+	if err != nil {
+		return nil, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  parentMerkleFD,
+		Ctx: ctx,
+	}
+
+	parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerVD,
+		Start: parent.lowerVD,
+	}, &vfs.StatOptions{})
+	if err == syserror.ENOENT {
+		return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// Since we are verifying against a directory Merkle tree, buf should
+	// contain the hash of the children in the parent Merkle tree when
+	// Verify returns with success.
+	var buf bytes.Buffer
+	if _, err := merkletree.Verify(&merkletree.VerifyParams{
+		Out:                   &buf,
+		File:                  &fdReader,
+		Tree:                  &fdReader,
+		Size:                  int64(parentSize),
+		Name:                  parent.name,
+		Mode:                  uint32(parentStat.Mode),
+		UID:                   parentStat.UID,
+		GID:                   parentStat.GID,
+		ReadOffset:            int64(offset),
+		ReadSize:              int64(merkletree.DigestSize()),
+		Expected:              parent.hash,
+		DataAndTreeInSameFile: true,
+	}); err != nil && err != io.EOF {
+		return nil, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+	}
+
+	// Cache the child's hash the first time it is verified.
+	if len(child.hash) == 0 {
+		child.hash = buf.Bytes()
+	}
+	return child, nil
+}
+
+// verifyStat verifies the stat against the verified hash. The mode/uid/gid of
+// the file is cached once it has been verified.
+func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	// Get the path to the child dentry. This is only used to provide path
+	// information in failure case.
+	childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+	if err != nil {
+		return err
+	}
+
+	fs.verityMu.RLock()
+	defer fs.verityMu.RUnlock()
+
+	fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  d.lowerMerkleVD,
+		Start: d.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err == syserror.ENOENT {
+		return alertIntegrityViolation(err, fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return err
+	}
+
+	merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	if err == syserror.ENODATA {
+		return alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return err
+	}
+
+	size, err := strconv.Atoi(merkleSize)
+	if err != nil {
+		return alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd,
+		Ctx: ctx,
+	}
+
+	var buf bytes.Buffer
+	params := &merkletree.VerifyParams{
+		Out:        &buf,
+		Tree:       &fdReader,
+		Size:       int64(size),
+		Name:       d.name,
+		Mode:       uint32(stat.Mode),
+		UID:        stat.UID,
+		GID:        stat.GID,
+		ReadOffset: 0,
+		// Set read size to 0 so only the metadata is verified.
+		ReadSize:              0,
+		Expected:              d.hash,
+		DataAndTreeInSameFile: false,
+	}
+	if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
+		params.DataAndTreeInSameFile = true
+	}
+
+	if _, err := merkletree.Verify(params); err != nil && err != io.EOF {
+		return alertIntegrityViolation(err, fmt.Sprintf("Verification stat for %s failed: %v", childPath, err))
+	}
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+	return nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if child, ok := parent.children[name]; ok {
+		// If enabling verification on files/directories is not allowed
+		// during runtime, all cached children are already verified. If
+		// runtime enable is allowed and the parent directory is
+		// enabled, we should verify the child hash here because it may
+		// have been cached before verity was enabled on it.
+		if fs.allowRuntimeEnable {
+			if parent.verityEnabled() {
+				if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+					return nil, err
+				}
+			}
+			if child.verityEnabled() {
+				vfsObj := fs.vfsfs.VirtualFilesystem()
+				mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+				stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+					Root:  child.lowerVD,
+					Start: child.lowerVD,
+				}, &vfs.StatOptions{
+					Mask: mask,
+				})
+				if err != nil {
+					return nil, err
+				}
+				if err := fs.verifyStat(ctx, child, stat); err != nil {
+					return nil, err
+				}
+			}
+		}
+		return child, nil
+	}
+	child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+	if err != nil {
+		return nil, err
+	}
+	if parent.children == nil {
+		parent.children = make(map[string]*dentry)
+	}
+	parent.children[name] = child
+	// The child's refcount is initially 0, so it may be dropped after
+	// traversal.
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
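+//
+// Illustrative lower file system layout for a child "foo" (the Merkle file
+// name follows merklePrefix; "foo" itself is just a hypothetical example):
+//
+//	parent/foo                 // the target file itself
+//	parent/.merkle.verity.foo  // its Merkle tree file, hidden from users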
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { + vfsObj := fs.vfsfs.VirtualFilesystem() + + childFilename := fspath.Parse(name) + childVD, childErr := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + Path: childFilename, + }, &vfs.GetDentryOptions{}) + + // We will handle ENOENT separately, as it may indicate unexpected + // modifications to the file system, and may cause a sentry panic. + if childErr != nil && childErr != syserror.ENOENT { + return nil, childErr + } + + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + if childErr == nil { + defer childVD.DecRef(ctx) + } + + childMerkleFilename := merklePrefix + name + childMerkleVD, childMerkleErr := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + Path: fspath.Parse(childMerkleFilename), + }, &vfs.GetDentryOptions{}) + + // We will handle ENOENT separately, as it may indicate unexpected + // modifications to the file system, and may cause a sentry panic. + if childMerkleErr != nil && childMerkleErr != syserror.ENOENT { + return nil, childMerkleErr + } + + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + if childMerkleErr == nil { + defer childMerkleVD.DecRef(ctx) + } + + // Get the path to the parent dentry. This is only used to provide path + // information in failure case. + parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD) + if err != nil { + return nil, err + } + + // TODO(b/166474175): Investigate all possible errors of childErr and + // childMerkleErr, and make sure we differentiate all errors that + // indicate unexpected modifications to the file system from the ones + // that are not harmful. + if childErr == syserror.ENOENT && childMerkleErr == nil { + // Failed to get child file/directory dentry. However the + // corresponding Merkle tree is found. This indicates an + // unexpected modification to the file system that + // removed/renamed the child. + return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name)) + } else if childErr == nil && childMerkleErr == syserror.ENOENT { + // If in allowRuntimeEnable mode, and the Merkle tree file is + // not created yet, we create an empty Merkle tree file, so that + // if the file is enabled through ioctl, we have the Merkle tree + // file open and ready to use. + // This may cause empty and unused Merkle tree files in + // allowRuntimeEnable mode, if they are never enabled. This + // does not affect verification, as we rely on cached hash to + // decide whether to perform verification, not the existence of + // the Merkle tree file. Also, those Merkle tree files are + // always hidden and cannot be accessed by verity fs users. 
+	if fs.allowRuntimeEnable {
+		childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parent.lowerVD,
+			Start: parent.lowerVD,
+			Path:  fspath.Parse(childMerkleFilename),
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDWR | linux.O_CREAT,
+			Mode:  0644,
+		})
+		if err != nil {
+			return nil, err
+		}
+		childMerkleFD.DecRef(ctx)
+		childMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parent.lowerVD,
+			Start: parent.lowerVD,
+			Path:  fspath.Parse(childMerkleFilename),
+		}, &vfs.GetDentryOptions{})
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		// If runtime enable is not allowed, this indicates an
+		// unexpected modification to the file system that
+		// removed/renamed the Merkle tree file.
+		return nil, alertIntegrityViolation(childMerkleErr, fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+	}
+	} else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+		// Both the child and the corresponding Merkle tree are missing.
+		// This could be an unexpected modification or due to an
+		// incorrect parameter.
+		// TODO(b/167752508): Investigate possible ways to differentiate
+		// cases where both files were deleted from cases where they
+		// never existed in the file system.
+		return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
+	}
+
+	mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+	stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  childVD,
+		Start: childVD,
+	}, &vfs.StatOptions{
+		Mask: mask,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	child := fs.newDentry()
+	child.lowerVD = childVD
+	child.lowerMerkleVD = childMerkleVD
+
+	// Increase the reference for both childVD and childMerkleVD as they are
+	// held by child. If this function fails and the child is destroyed, the
+	// references will be decreased in destroyLocked.
+	childVD.IncRef()
+	childMerkleVD.IncRef()
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+
+	child.mode = uint32(stat.Mode)
+	child.uid = stat.UID
+	child.gid = stat.GID
+
+	// Verify the child hash. This should always be performed unless in
+	// allowRuntimeEnable mode and the parent directory hasn't been enabled
+	// yet.
+	if parent.verityEnabled() {
+		if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+	if child.verityEnabled() {
+		if err := fs.verifyStat(ctx, child, stat); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+
+	return child, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -104,8 +576,39 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 //
 // Preconditions: fs.renameMu must be locked. !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement walkParentDirLocked.
-	return nil, nil
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
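+//
+// A typical caller sketch (illustrative; it mirrors the pattern used by the
+// FilesystemImpl methods below, e.g. StatAt):
+//
+//	var ds *[]*dentry
+//	fs.renameMu.RLock()
+//	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+//	d, err := fs.resolveLocked(ctx, rp, &ds)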
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. @@ -179,8 +682,183 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - //TODO(b/159261227): Implement OpenAt. - return nil, nil + // Verity fs is read-only. + if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 { + return nil, syserror.EROFS + } + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + + start := rp.Start().Impl().(*dentry) + if rp.Done() { + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + + // Open existing child or follow symlink. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds) + parent.dirMu.Unlock() + if err != nil { + return nil, err + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + // Users should not open the Merkle tree files. Those are for verity fs + // use only. + if strings.Contains(d.name, merklePrefix) { + return nil, syserror.EPERM + } + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + + // Verity fs is read-only. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EROFS + } + + // Get the path to the target file. This is only used to provide path + // information in failure case. + path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return nil, err + } + + // Open the file in the underlying file system. + lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, opts) + + // The file should exist, as we succeeded in finding its dentry. If it's + // missing, it indicates an unexpected modification to the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("File %s expected but not found", path)) + } + return nil, err + } + + // lowerFD needs to be cleaned up if any error occurs. IncRef will be + // called if a verity FD is successfully created. + defer lowerFD.DecRef(ctx) + + // Open the Merkle tree file corresponding to the current file/directory + // to be used later for verifying Read/Walk. 
+ merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + + // The Merkle tree file should exist, as we succeeded in finding its + // dentry. If it's missing, it indicates an unexpected modification to + // the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + + // merkleReader needs to be cleaned up if any error occurs. IncRef will + // be called if a verity FD is successfully created. + defer merkleReader.DecRef(ctx) + + lowerFlags := lowerFD.StatusFlags() + lowerFDOpts := lowerFD.Options() + var merkleWriter *vfs.FileDescription + var parentMerkleWriter *vfs.FileDescription + + // Only open the Merkle tree files for write if in allowRuntimeEnable + // mode. + if d.fs.allowRuntimeEnable { + merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + // merkleWriter is cleaned up if any error occurs. IncRef will + // be called if a verity FD is created successfully. + defer merkleWriter.DecRef(ctx) + + if d.parent != nil { + parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.parent.lowerMerkleVD, + Start: d.parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) + } + return nil, err + } + // parentMerkleWriter is cleaned up if any error occurs. IncRef + // will be called if a verity FD is created successfully. + defer parentMerkleWriter.DecRef(ctx) + } + } + + fd := &fileDescription{ + d: d, + lowerFD: lowerFD, + merkleReader: merkleReader, + merkleWriter: merkleWriter, + parentMerkleWriter: parentMerkleWriter, + isDir: d.isDir(), + } + + if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + return nil, err + } + lowerFD.IncRef() + merkleReader.IncRef() + if merkleWriter != nil { + merkleWriter.IncRef() + } + if parentMerkleWriter != nil { + parentMerkleWriter.IncRef() + } + return &fd.vfsfd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -218,6 +896,8 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts } // StatAt implements vfs.FilesystemImpl.StatAt. +// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should +// be verified. 
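+//
+// Only type, mode, UID, and GID are verified today; the mask used for those
+// lookups throughout this file is (sketch, mirroring getChildLocked):
+//
+//	mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)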
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() @@ -235,6 +915,11 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } + if d.verityEnabled() { + if err := fs.verifyStat(ctx, d, stat); err != nil { + return linux.Statx{}, err + } + } return stat, nil } @@ -256,7 +941,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() @@ -267,8 +952,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) @@ -277,14 +962,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, err } lowerVD := d.lowerVD - return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: lowerVD, Start: lowerVD, }, size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) @@ -293,20 +978,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", err } lowerVD := d.lowerVD - return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: lowerVD, Start: lowerVD, }, &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { // Verity file system is read-only. return syserror.EROFS } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { // Verity file system is read-only. 
 	return syserror.EROFS
 }
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index cb29d33a5..8dc9e26bc 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -22,29 +22,60 @@ package verity
 import (
+	"fmt"
+	"strconv"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/merkletree"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Name is the default filesystem name.
 const Name = "verity"
 
-// testOnlyDebugging allows verity file system to return error instead of
-// crashing the application when a malicious action is detected. This should
-// only be set for tests.
-var testOnlyDebugging bool
+// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+// tree file for "/foo" is "/.merkle.verity.foo".
+const merklePrefix = ".merkle.verity."
+
+// merkleOffsetInParentXattr is the extended attribute name specifying the
+// offset of the child hash in its parent's Merkle tree.
+const merkleOffsetInParentXattr = "user.merkle.offset"
+
+// merkleSizeXattr is the extended attribute name specifying the size of data
+// hashed by the corresponding Merkle tree. For a file, it's the size of the
+// whole file. For a directory, it's the size of all its children's hashes.
+const merkleSizeXattr = "user.merkle.size"
+
+// sizeOfStringInt32 is the size for a 32 bit integer stored as a string in
+// extended attributes. The maximum value of a 32 bit integer has 10 digits.
+const sizeOfStringInt32 = 10
+
+// noCrashOnVerificationFailure indicates whether the sandbox should avoid
+// panicking when verification fails. If true, an error is returned instead
+// of panicking. This should only be set for tests.
+// TODO(b/165661693): Decide whether to panic or return error based on this
+// flag.
+var noCrashOnVerificationFailure bool
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
@@ -69,11 +100,24 @@ type filesystem struct {
 	// renameMu synchronizes renaming with non-renaming operations in order
 	// to ensure consistent lock ordering between dentry.dirMu in different
 	// dentries.
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
+
+	// verityMu synchronizes the enabling of verity files, protecting files
+	// and directories from being enabled by different threads
+	// simultaneously. It also ensures that verity does not access files
+	// that are being enabled.
+	//
+	// Also, a directory's Merkle tree depends on the generated trees of
+	// its children, so they shouldn't be enabled at the same time. This
+	// lock is for the whole file system to ensure that no more than one
+	// file is enabled at the same time.
+	verityMu sync.RWMutex
 }
 
 // InternalFilesystemOptions may be passed as
 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type InternalFilesystemOptions struct {
 	// RootMerkleFileName is the name of the verity root Merkle tree file.
 	RootMerkleFileName string
@@ -93,10 +137,10 @@ type InternalFilesystemOptions struct {
 	// system wrapped by verity file system.
 	LowerGetFSOptions vfs.GetFilesystemOptions
 
-	// TestOnlyDebugging allows verity file system to return error instead
-	// of crashing the application when a malicious action is detected. This
-	// should only be set for tests.
-	TestOnlyDebugging bool
+	// NoCrashOnVerificationFailure indicates whether the sandbox should
+	// avoid panicking when verification fails. If true, an error is
+	// returned instead of panicking. This should only be set for tests.
+	NoCrashOnVerificationFailure bool
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -104,10 +148,129 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// alertIntegrityViolation alerts the caller of an integrity violation, which
+// usually means that an unexpected modification to the file system was
+// detected. In noCrashOnVerificationFailure mode it returns an error;
+// otherwise it panics.
+func alertIntegrityViolation(err error, msg string) error {
+	if noCrashOnVerificationFailure {
+		return err
+	}
+	panic(msg)
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	//TODO(b/159261227): Implement GetFilesystem.
-	return nil, nil, nil
+	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+	if !ok {
+		ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+		return nil, nil, syserror.EINVAL
+	}
+	noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure
+
+	// Mount the lower file system. The lower file system is wrapped inside
+	// verity, and should not be exposed or connected.
+	mopts := &vfs.MountOptions{
+		GetFilesystemOptions: iopts.LowerGetFSOptions,
+		InternalMount:        true,
+	}
+	mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		creds:              creds.Fork(),
+		lowerMount:         mnt,
+		allowRuntimeEnable: iopts.AllowRuntimeEnable,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry.
+	d := fs.newDentry()
+	d.refs = 1
+	lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+	lowerVD.IncRef()
+	d.lowerVD = lowerVD
+
+	rootMerkleName := merklePrefix + iopts.RootMerkleFileName
+
+	lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+		Path:  fspath.Parse(rootMerkleName),
+	}, &vfs.GetDentryOptions{})
+
+	// If runtime enable is allowed, the root Merkle tree file may be
+	// absent. We should create the tree file.
+	if err == syserror.ENOENT && fs.allowRuntimeEnable {
+		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDWR | linux.O_CREAT,
+			Mode:  0644,
+		})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+		lowerMerkleFD.DecRef(ctx)
+		lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.GetDentryOptions{})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+	} else if err != nil {
+		// Failed to get dentry for the root Merkle file. This
+		// indicates an unexpected modification that removed/renamed
+		// the root Merkle file, or that it was never generated.
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, alertIntegrityViolation(err, "Failed to find root Merkle file")
+	}
+	d.lowerMerkleVD = lowerMerkleVD
+
+	// Get metadata from the underlying file system.
+	const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.StatOptions{
+		Mask: statMask,
+	})
+	if err != nil {
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, err
+	}
+
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+	d.hash = make([]byte, len(iopts.RootHash))
+
+	if !fs.allowRuntimeEnable {
+		if err := fs.verifyStat(ctx, d, stat); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	copy(d.hash, iopts.RootHash)
+	d.vfsd.Init(d)
+
+	fs.rootDentry = d
+
+	return &fs.vfsfs, &d.vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -116,6 +279,8 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
@@ -142,7 +307,7 @@ type dentry struct {
 	// and dirents (if not nil) is a cache of dirents as returned by
 	// directoryFDs representing this directory. children is protected by
 	// dirMu.
-	dirMu sync.Mutex
+	dirMu sync.Mutex `state:"nosave"`
 	children map[string]*dentry
 
 	// lowerVD is the VirtualDentry in the underlying file system.
@@ -152,8 +317,8 @@ type dentry struct {
 	// in the underlying file system.
 	lowerMerkleVD vfs.VirtualDentry
 
-	// rootHash is the rootHash for the current file or directory.
-	rootHash []byte
+	// hash is the calculated hash for the current file or directory.
+	hash []byte
 }
 
 // newDentry creates a new dentry representing the given verity file. The
@@ -275,6 +440,14 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
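+//
+// In runtime enable mode, a target is typically enabled via the ioctl, e.g.
+// (sketch taken from the tests in this change):
+//
+//	var args arch.SyscallArguments
+//	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+//	fd.Ioctl(ctx, nil /* uio */, args)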
+func (d *dentry) verityEnabled() bool {
+	return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
 func (d *dentry) readlink(ctx context.Context) (string, error) {
 	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root: d.lowerVD,
@@ -286,6 +459,8 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
 // FileDescription is a wrapper of the underlying lowerFD, with support to build
 // Merkle trees through the Linux fs-verity API to verify contents read from
 // lowerFD.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -335,6 +510,11 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	if fd.d.verityEnabled() {
+		if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil {
+			return linux.Statx{}, err
+		}
+	}
 	return stat, nil
 }
 
@@ -344,12 +524,273 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return syserror.EPERM
 }
 
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
+// of the generated Merkle tree and the data size are returned. If fd points
+// to a regular file, the data is the content of the file. If fd points to a
+// directory, the data is all hashes of its children, written to the Merkle
+// tree file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd.lowerFD,
+		Ctx: ctx,
+	}
+	merkleReader := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleReader,
+		Ctx: ctx,
+	}
+	merkleWriter := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleWriter,
+		Ctx: ctx,
+	}
+	params := &merkletree.GenerateParams{
+		TreeReader: &merkleReader,
+		TreeWriter: &merkleWriter,
+	}
+
+	switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+	case linux.S_IFREG:
+		// For a regular file, generate a Merkle tree based on its
+		// content.
+		var err error
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.File = &fdReader
+		params.Size = int64(stat.Size)
+		params.Name = fd.d.name
+		params.Mode = uint32(stat.Mode)
+		params.UID = stat.UID
+		params.GID = stat.GID
+		params.DataAndTreeInSameFile = false
+	case linux.S_IFDIR:
+		// For a directory, generate a Merkle tree based on the hashes
+		// of its children that have already been written to the Merkle
+		// tree file.
+		merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.Size = int64(merkleStat.Size)
+
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.File = &merkleReader
+		params.Name = fd.d.name
+		params.Mode = uint32(stat.Mode)
+		params.UID = stat.UID
+		params.GID = stat.GID
+		params.DataAndTreeInSameFile = true
+	default:
+		// TODO(b/167728857): Investigate whether and how we should
+		// enable other types of files.
+		return nil, 0, syserror.EINVAL
+	}
+	hash, err := merkletree.Generate(params)
+	return hash, uint64(params.Size), err
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and storing its hash in its parent directory's Merkle tree.
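+//
+// Sketch of the metadata recorded as a result (the attribute names are the
+// xattr constants defined in this file):
+//
+//	user.merkle.offset  // where the child's hash lives in the parent's Merkle tree file
+//	user.merkle.size    // size of the data hashed for this file or directory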
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) { + if !fd.d.fs.allowRuntimeEnable { + return 0, syserror.EPERM + } + + fd.d.fs.verityMu.Lock() + defer fd.d.fs.verityMu.Unlock() + + // In allowRuntimeEnable mode, the underlying fd and read/write fd for + // the Merkle tree file should have all been initialized. For any file + // or directory other than the root, the parent Merkle tree file should + // have also been initialized. + if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) { + return 0, alertIntegrityViolation(syserror.EIO, "Unexpected verity fd: missing expected underlying fds") + } + + hash, dataSize, err := fd.generateMerkle(ctx) + if err != nil { + return 0, err + } + + if fd.parentMerkleWriter != nil { + stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + + // Write the hash of fd to the parent directory's Merkle tree + // file, as it should be part of the parent Merkle tree data. + // parentMerkleWriter is open with O_APPEND, so it should write + // directly to the end of the file. + if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil { + return 0, err + } + + // Record the offset of the hash of fd in parent directory's + // Merkle tree file. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleOffsetInParentXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return 0, err + } + } + + // Record the size of the data being hashed for fd. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleSizeXattr, + Value: strconv.Itoa(int(dataSize)), + }); err != nil { + return 0, err + } + fd.d.hash = append(fd.d.hash, hash...) + return 0, nil +} + +// measureVerity returns the hash of fd, saved in verityDigest. +func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + var metadata linux.DigestMetadata + + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that + // verity is not enabled for the file. If allowRuntimeEnable is false, + // this is an integrity violation because all files should have verity + // enabled, in which case fd.d.hash should be set. + if len(fd.d.hash) == 0 { + if fd.d.fs.allowRuntimeEnable { + return 0, syserror.ENODATA + } + return 0, alertIntegrityViolation(syserror.ENODATA, "Ioctl measureVerity: no hash found") + } + + // The first part of VerityDigest is the metadata. + if _, err := metadata.CopyIn(t, verityDigest); err != nil { + return 0, err + } + if metadata.DigestSize < uint16(len(fd.d.hash)) { + return 0, syserror.EOVERFLOW + } + + // Populate the output digest size, since DigestSize is both input and + // output. + metadata.DigestSize = uint16(len(fd.d.hash)) + + // First copy the metadata. + if _, err := metadata.CopyOut(t, verityDigest); err != nil { + return 0, err + } + + // Now copy the root hash bytes to the memory after metadata. + _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash) + return 0, err +} + +func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) { + f := int32(0) + + // All enabled files should store a hash. This flag is not settable via + // FS_IOC_SETFLAGS. 
+ if len(fd.d.hash) != 0 { + f |= linux.FS_VERITY_FL + } + + t := kernel.TaskFromContext(ctx) + _, err := primitive.CopyInt32Out(t, flags, f) + return 0, err +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FS_IOC_ENABLE_VERITY: + return fd.enableVerity(ctx, uio) + case linux.FS_IOC_MEASURE_VERITY: + return fd.measureVerity(ctx, uio, args[2].Pointer()) + case linux.FS_IOC_GETFLAGS: + return fd.verityFlags(ctx, uio, args[2].Pointer()) + default: + // TODO(b/169682228): Investigate which ioctl commands should + // be allowed. + return 0, syserror.ENOSYS + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // No need to verify if the file is not enabled yet in + // allowRuntimeEnable mode. + if !fd.d.verityEnabled() { + return fd.lowerFD.PRead(ctx, dst, offset, opts) + } + + fd.d.fs.verityMu.RLock() + defer fd.d.fs.verityMu.RUnlock() + // dataSize is the size of the whole file. + dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the xattr does not exist, it + // indicates unexpected modifications to the file system. + if err == syserror.ENODATA { + return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) + } + if err != nil { + return 0, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + size, err := strconv.Atoi(dataSize) + if err != nil { + return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err)) + } + + dataReader := vfs.FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + + merkleReader := vfs.FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + + n, err := merkletree.Verify(&merkletree.VerifyParams{ + Out: dst.Writer(ctx), + File: &dataReader, + Tree: &merkleReader, + Size: int64(size), + Name: fd.d.name, + Mode: fd.d.mode, + UID: fd.d.uid, + GID: fd.d.gid, + ReadOffset: offset, + ReadSize: dst.NumBytes(), + Expected: fd.d.hash, + DataAndTreeInSameFile: false, + }) + if err != nil { + return 0, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification failed: %v", err)) + } + return n, err +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) + return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 
 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
-	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+	return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
new file mode 100644
index 000000000..e301d35f5
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -0,0 +1,491 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rootMerkleFilename is the name of the root Merkle tree file.
+const rootMerkleFilename = "root.verity"
+
+// maxDataSize is the maximum data size written to the file for test.
+const maxDataSize = 100000
+
+// newVerityRoot creates a new verity mount, and returns the root. The
+// underlying file system is tmpfs. If the returned error is nil, a cleanup
+// function that releases the root is registered via t.Cleanup, so no manual
+// cleanup is needed.
+func newVerityRoot(ctx context.Context, t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, error) {
+	rand.Seed(time.Now().UnixNano())
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		return nil, vfs.VirtualDentry{}, fmt.Errorf("VFS init: %v", err)
+	}
+
+	vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: InternalFilesystemOptions{
+				RootMerkleFileName:           rootMerkleFilename,
+				LowerName:                    "tmpfs",
+				AllowRuntimeEnable:           true,
+				NoCrashOnVerificationFailure: true,
+			},
+		},
+	})
+	if err != nil {
+		return nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace: %v", err)
+	}
+	root := mntns.Root()
+	root.IncRef()
+	t.Helper()
+	t.Cleanup(func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	})
+	return vfsObj, root, nil
+}
+
+// newFileFD creates a new file in the verity mount, and returns the FD. The
+// file the FD points to is filled with random data.
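+//
+// Typical use in the tests below (illustrative):
+//
+//	fd, size, err := newFileFD(ctx, vfsObj, root, "verity-test-file", 0644)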
+func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+
+	// Create the file in the underlying file system.
+	lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(filePath),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  linux.ModeRegular | mode,
+	})
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// Generate random data to be written to the file.
+	dataSize := rand.Intn(maxDataSize) + 1
+	data := make([]byte, dataSize)
+	rand.Read(data)
+
+	// Write directly to the underlying FD, since the verity FD is
+	// read-only.
+	n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		return nil, 0, err
+	}
+
+	if n != int64(len(data)) {
+		return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data))
+	}
+
+	lowerFD.DecRef(ctx)
+
+	// Now open the verity file descriptor.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filePath),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular | mode,
+	})
+	return fd, dataSize, err
+}
+
+// corruptRandomBit randomly flips a bit in the file represented by fd.
+func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
+	// Flip a random bit in the underlying file.
+	randomPos := int64(rand.Intn(size))
+	byteToModify := make([]byte, 1)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil {
+		return fmt.Errorf("lowerFD.PRead: %v", err)
+	}
+	byteToModify[0] ^= 1
+	if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil {
+		return fmt.Errorf("lowerFD.PWrite: %v", err)
+	}
+	return nil
+}
+
+// TestOpen ensures that when a file is created, the corresponding Merkle tree
+// file and the root Merkle tree file exist.
+func TestOpen(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Ensure that the corresponding Merkle tree file is created.
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(merklePrefix + filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}); err != nil {
+		t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
+	}
+
+	// Ensure the root Merkle tree file is created.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(merklePrefix + rootMerkleFilename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}); err != nil {
+		t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+	}
+}
+
+// TestReadUnmodifiedFileSucceeds ensures that reading from an untouched verity
+// file succeeds after enabling verity for it.
+func TestReadUnmodifiedFileSucceeds(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	buf := make([]byte, size)
+	n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead: %v", err)
+	}
+
+	if n != int64(size) {
+		t.Errorf("fd.PRead got read length %d, want %d", n, size)
+	}
+}
+
+// TestReopenUnmodifiedFileSucceeds ensures that reopening an untouched verity
+// file succeeds after enabling verity for it.
+func TestReopenUnmodifiedFileSucceeds(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Ensure reopening the verity enabled file succeeds.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular,
+	}); err != nil {
+		t.Errorf("reopen enabled file failed: %v", err)
+	}
+}
+
+// TestModifiedFileFails ensures that reading from a modified verity file
+// fails.
+func TestModifiedFileFails(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerFD that's read/writable.
+	lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+	lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from the modified file fails.
+	buf := make([]byte, size)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+		t.Fatalf("fd.PRead succeeded with modified file")
+	}
+}
+
+// TestModifiedMerkleFails ensures that reading from a verity file fails if
+// the corresponding Merkle tree file is modified.
+func TestModifiedMerkleFails(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerMerkleFD that's read/writable.
+	lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
+
+	lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerMerkleVD,
+		Start: lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	// Flip a random bit in the Merkle tree file.
+	stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	merkleSize := int(stat.Size)
+	if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from a file with a modified Merkle tree fails.
+	buf := make([]byte, size)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+		t.Fatalf("fd.PRead succeeded with modified Merkle file")
+	}
+}
+
+// TestModifiedParentMerkleFails ensures that opening a verity enabled file in
+// a verity enabled directory fails if the hashes related to the target file
+// in the parent Merkle tree file are modified.
+func TestModifiedParentMerkleFails(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Enable verity on the parent directory.
+	parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerMerkleFD that's read/writable.
+	parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
+
+	parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  parentLowerMerkleVD,
+		Start: parentLowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	// Flip a random bit in the parent Merkle tree file.
+	// This parent directory contains only one child, so any random
+	// modification in the parent Merkle tree should cause verification
+	// failure when opening the child file.
+	stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	parentMerkleSize := int(stat.Size)
+	if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	parentLowerMerkleFD.DecRef(ctx)
+
+	// Ensure reopening the verity enabled file fails.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular,
+	}); err == nil {
+		t.Errorf("OpenAt file with modified parent Merkle succeeded")
+	}
+}
+
+// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file
+// succeeds after enabling verity for it.
+func TestUnmodifiedStatSucceeds(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm that stat succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("fd.Ioctl: %v", err)
+	}
+
+	if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
+		t.Errorf("fd.Stat: %v", err)
+	}
+}
+
+// TestModifiedStatFails checks that stat fails for a file whose underlying
+// stat has been modified.
+func TestModifiedStatFails(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj, root, err := newVerityRoot(ctx, t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("fd.Ioctl: %v", err)
+	}
+
+	lowerFD := fd.Impl().(*fileDescription).lowerFD
+	// Change the stat of the underlying file, and check that stat fails.
+	if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: uint32(linux.STATX_MODE),
+			Mode: 0777,
+		},
+	}); err != nil {
+		t.Fatalf("lowerFD.SetStat: %v", err)
+	}
+
+	if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil {
+		t.Errorf("fd.Stat succeeded when it should fail")
+	}
+}
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
index 61c78569d..300b7ccce 100644
--- a/pkg/sentry/hostmm/BUILD
+++ b/pkg/sentry/hostmm/BUILD
@@ -7,11 +7,14 @@ go_library(
     srcs = [
         "cgroup.go",
         "hostmm.go",
+        "membarrier.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/abi/linux",
         "//pkg/fd",
         "//pkg/log",
         "//pkg/usermem",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/hostmm/membarrier.go b/pkg/sentry/hostmm/membarrier.go
new file mode 100644
index 000000000..4468d75f1
--- /dev/null
+++ b/pkg/sentry/hostmm/membarrier.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmm
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+var (
+	haveMembarrierGlobal           = false
+	haveMembarrierPrivateExpedited = false
+)
+
+func init() {
+	supported, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */)
+	if e != 0 {
+		if e != syscall.ENOSYS {
+			log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error())
+		}
+		return
+	}
+	// We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to
+	// all CPUs running tasks that have previously invoked
+	// MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DoS risk.
+	// (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e., it waits for an RCU
+	// grace period to elapse without bothering other CPUs.
+	// MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks
+	// sharing the caller's MM.)
+	if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 {
+		haveMembarrierGlobal = true
+	}
+	if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req {
+		if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
+			log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error())
+		} else {
+			haveMembarrierPrivateExpedited = true
+		}
+	}
+}
+
+// HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported.
+func HaveGlobalMemoryBarrier() bool {
+	return haveMembarrierGlobal
+}
+
+// GlobalMemoryBarrier blocks until "all running threads [in the host OS] have
+// passed through a state where all memory accesses to user-space addresses
+// match program order between entry to and return from [GlobalMemoryBarrier]",
+// as for membarrier(2).
+//
+// Preconditions: HaveGlobalMemoryBarrier() == true.
+func GlobalMemoryBarrier() error {
+	if _, _, e := syscall.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 {
+		return e
+	}
+	return nil
+}
+
+// HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported.
+func HaveProcessMemoryBarrier() bool {
+	return haveMembarrierPrivateExpedited
+}
+
+// ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only
+// synchronizes with threads sharing a virtual address space (from the host OS'
+// perspective) with the calling thread.
+//
+// Preconditions: HaveProcessMemoryBarrier() == true.
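+//
+// Illustrative only (not part of this change): a caller elsewhere in the
+// sentry might pair the capability check with the barrier like this, where
+// hostmm and log refer to the packages above.
+//
+//	if hostmm.HaveProcessMemoryBarrier() {
+//		if err := hostmm.ProcessMemoryBarrier(); err != nil {
+//			log.Warningf("ProcessMemoryBarrier: %v", err)
+//		}
+//	}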
+func ProcessMemoryBarrier() error {
+	if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
+		return e
+	}
+	return nil
+}
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 07bf39fed..5bba9de0b 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -15,6 +15,7 @@ go_library(
     ],
     deps = [
         "//pkg/context",
+        "//pkg/tcpip",
         "//pkg/tcpip/stack",
     ],
 )
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index c0b4831d1..fbe6d6aa6 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -15,7 +15,10 @@
 // Package inet defines semantics for IP stacks.
 package inet
 
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
 
 // Stack represents a TCP/IP stack.
 type Stack interface {
@@ -80,6 +83,12 @@ type Stack interface {
 	// RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
 	// for restoring a stack after a save.
 	RestoreCleanupEndpoints([]stack.TransportEndpoint)
+
+	// Forwarding returns whether packet forwarding between NICs is enabled.
+	Forwarding(protocol tcpip.NetworkProtocolNumber) bool
+
+	// SetForwarding enables or disables packet forwarding between NICs.
+	SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error
 }
 
 // Interface contains information about a network interface.
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index 9771f01fc..1779cc6f3 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -14,7 +14,10 @@
 package inet
 
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
 
 // TestStack is a dummy implementation of Stack for tests.
 type TestStack struct {
@@ -26,6 +29,7 @@ type TestStack struct {
 	TCPSendBufSize TCPBufferSize
 	TCPSACKFlag    bool
 	Recovery       TCPLossRecovery
+	IPForwarding   bool
 }
 
 // NewTestStack returns a TestStack with no network interfaces. The value of
@@ -128,3 +132,14 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint {
 
 // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
 func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+	return s.IPForwarding
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
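+//
+// A sketch of how a test might exercise these hooks (illustrative only;
+// ipv4.ProtocolNumber is assumed to come from pkg/tcpip/network/ipv4):
+//
+//	s := inet.NewTestStack()
+//	if err := s.SetForwarding(ipv4.ProtocolNumber, true); err != nil {
+//		t.Fatalf("SetForwarding: %v", err)
+//	}
+//	if !s.Forwarding(ipv4.ProtocolNumber) {
+//		t.Error("forwarding was not recorded")
+//	}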
+func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { + s.IPForwarding = enable + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 5416a310d..c0de72eef 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -69,8 +69,63 @@ go_template_instance( prefix = "socket", template = "//pkg/ilist:generic_list", types = { - "Element": "*SocketEntry", - "Linker": "*SocketEntry", + "Element": "*SocketRecordVFS1", + "Linker": "*SocketRecordVFS1", + }, +) + +go_template_instance( + name = "fd_table_refs", + out = "fd_table_refs.go", + package = "kernel", + prefix = "FDTable", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FDTable", + }, +) + +go_template_instance( + name = "fs_context_refs", + out = "fs_context_refs.go", + package = "kernel", + prefix = "FSContext", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FSContext", + }, +) + +go_template_instance( + name = "ipc_namespace_refs", + out = "ipc_namespace_refs.go", + package = "kernel", + prefix = "IPCNamespace", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "IPCNamespace", + }, +) + +go_template_instance( + name = "process_group_refs", + out = "process_group_refs.go", + package = "kernel", + prefix = "ProcessGroup", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "ProcessGroup", + }, +) + +go_template_instance( + name = "session_refs", + out = "session_refs.go", + package = "kernel", + prefix = "Session", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Session", }, ) @@ -88,9 +143,14 @@ go_library( "aio.go", "context.go", "fd_table.go", + "fd_table_refs.go", "fd_table_unsafe.go", "fs_context.go", + "fs_context_refs.go", "ipc_namespace.go", + "ipc_namespace_refs.go", + "kcov.go", + "kcov_unsafe.go", "kernel.go", "kernel_opts.go", "kernel_state.go", @@ -99,6 +159,7 @@ go_library( "pending_signals_state.go", "posixtimer.go", "process_group_list.go", + "process_group_refs.go", "ptrace.go", "ptrace_amd64.go", "ptrace_arm64.go", @@ -106,6 +167,7 @@ go_library( "seccomp.go", "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", + "session_refs.go", "sessions.go", "signal.go", "signal_handlers.go", @@ -147,20 +209,24 @@ go_library( "gvisor.dev/gvisor/pkg/sentry/device", "gvisor.dev/gvisor/pkg/tcpip", ], + marshal = True, visibility = ["//:sandbox"], deps = [ ":uncaught_signal_go_proto", "//pkg/abi", "//pkg/abi/linux", "//pkg/amutex", - "//pkg/binary", "//pkg/bits", "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", + "//pkg/coverage", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/refs", "//pkg/refs_vfs2", @@ -210,7 +276,6 @@ go_library( "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 2bc49483a..869e49ebc 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -57,6 +57,7 @@ go_library( "id_map_set.go", "user_namespace.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index ef5723127..c08d47787 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials { } return NewAnonymousCredentials() } + +// 
ContextWithCredentials returns a copy of ctx carrying creds.
+func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
+	return &authContext{ctx, creds}
+}
+
+type authContext struct {
+	context.Context
+	creds *Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxCredentials:
+		return ac.creds
+	default:
+		return ac.Context.Value(key)
+	}
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
index 0a58ba17c..4c32ee703 100644
--- a/pkg/sentry/kernel/auth/id.go
+++ b/pkg/sentry/kernel/auth/id.go
@@ -19,9 +19,13 @@ import (
 )
 
 // UID is a user ID in an unspecified user namespace.
+//
+// +marshal
 type UID uint32
 
 // GID is a group ID in an unspecified user namespace.
+//
+// +marshal slice:GIDSlice
 type GID uint32
 
 // In the root user namespace, user/group IDs have a 1-to-1 relationship with
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index dd5f0f5fa..bb94769c4 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
 }
 
 // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
-// or nil if there is no such IPC namespace.
+// or nil if there is no such IPC namespace. It takes a reference on the
+// namespace.
 func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
 	if v := ctx.Value(CtxIPCNamespace); v != nil {
 		return v.(*IPCNamespace)
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ce53af69b..0ec7344cd 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -23,7 +23,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -78,7 +77,8 @@ type descriptor struct {
 //
 // +stateify savable
 type FDTable struct {
-	refs.AtomicRefCount
+	FDTableRefs
+
 	k *Kernel
 
 	// mu protects below.
@@ -111,8 +111,11 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
 func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 	ctx := context.Background()
 	f.init() // Initialize table.
	f.used = 0
 	for fd, d := range m {
-		f.setAll(fd, d.file, d.fileVFS2, d.flags)
+		if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
+			panic("VFS1 or VFS2 files set")
+		}
 
 		// Note that we do _not_ need to acquire an extra table reference here. The
 		// table reference will already be accounted for in the file, so we drop the
@@ -127,7 +130,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 }
 
 // drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
 	// Release locks.
 	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
 
@@ -145,14 +148,13 @@ func (f *FDTable) drop(file *fs.File) {
 	d.InotifyEvent(ev, 0)
 
 	// Drop the table reference.
-	file.DecRef(context.Background())
+	file.DecRef(ctx)
 }
 
 // dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
 	// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
 	// entire file.
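 	// (With whence SEEK_SET, a start and length of 0 denote the range from
 	// offset 0 through EOF, matching POSIX record-lock semantics in
 	// fcntl(2).)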
- ctx := context.Background() err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) if err != nil && err != syserror.ENOLCK { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) @@ -176,22 +178,15 @@ func (k *Kernel) NewFDTable() *FDTable { return f } -// destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy(ctx context.Context) { - f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. +// DecRef implements RefCounter.DecRef. +// +// If f reaches zero references, all of its file descriptors are removed. func (f *FDTable) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) -} - -// Size returns the number of file descriptor slots currently allocated. -func (f *FDTable) Size() int { - size := atomic.LoadInt32(&f.used) - return int(size) + f.FDTableRefs.DecRef(func() { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) + }) } // forEach iterates over all non-nil files in sorted order. @@ -280,7 +275,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags } f.mu.Lock() - defer f.mu.Unlock() // From f.next to find available fd. if fd < f.next { @@ -290,15 +284,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.get(i); d == nil { - f.set(i, files[len(fds)], flags) // Set the descriptor. - fds = append(fds, i) // Record the file descriptor. + // Set the descriptor. + f.set(ctx, i, files[len(fds)], flags) + fds = append(fds, i) // Record the file descriptor. } } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { - f.set(i, nil, FDFlags{}) // Zap entry. + f.set(ctx, i, nil, FDFlags{}) + } + f.mu.Unlock() + + // Drop the reference taken by the call to f.set() that + // originally installed the file. Don't call f.drop() + // (generating inotify events, etc.) since the file should + // appear to have never been inserted into f. + for _, file := range files[:len(fds)] { + file.DecRef(ctx) } return nil, syscall.EMFILE } @@ -308,6 +312,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags f.next = fds[len(fds)-1] + 1 } + f.mu.Unlock() return fds, nil } @@ -335,7 +340,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes } f.mu.Lock() - defer f.mu.Unlock() // From f.next to find available fd. if fd < f.next { @@ -345,15 +349,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.getVFS2(i); d == nil { - f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. - fds = append(fds, i) // Record the file descriptor. + // Set the descriptor. + f.setVFS2(ctx, i, files[len(fds)], flags) + fds = append(fds, i) // Record the file descriptor. } } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { - f.setVFS2(i, nil, FDFlags{}) // Zap entry. + f.setVFS2(ctx, i, nil, FDFlags{}) + } + f.mu.Unlock() + + // Drop the reference taken by the call to f.setVFS2() that + // originally installed the file. Don't call f.dropVFS2() + // (generating inotify events, etc.) since the file should + // appear to have never been inserted into f. 
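+	// (Only the first len(fds) entries of files were actually installed,
+	// so only they carry a table reference to drop; files[len(fds):] were
+	// never inserted.)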
+	for _, file := range files[:len(fds)] {
+		file.DecRef(ctx)
+	}
 		return nil, syscall.EMFILE
 	}
@@ -363,6 +377,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
 		f.next = fds[len(fds)-1] + 1
 	}
 
+	f.mu.Unlock()
 	return fds, nil
 }
 
@@ -398,7 +413,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 	}
 	for fd < end {
 		if d, _, _ := f.getVFS2(fd); d == nil {
-			f.setVFS2(fd, file, flags)
+			f.setVFS2(ctx, fd, file, flags)
 			if fd == f.next {
 				// Update next search start position.
 				f.next = fd + 1
@@ -414,40 +429,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 // reference for that FD, the ref count for that existing reference is
 // decremented.
 func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
-	return f.newFDAt(ctx, fd, file, nil, flags)
+	df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
+	if err != nil {
+		return err
+	}
+	if df != nil {
+		f.drop(ctx, df)
+	}
+	return nil
 }
 
 // NewFDAtVFS2 sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
 func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
-	return f.newFDAt(ctx, fd, nil, file, flags)
+	_, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
+	if err != nil {
+		return err
+	}
+	if dfVFS2 != nil {
+		f.dropVFS2(ctx, dfVFS2)
+	}
+	return nil
 }
 
-func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
 	if fd < 0 {
 		// Don't accept negative FDs.
-		return syscall.EBADF
+		return nil, nil, syscall.EBADF
 	}
 
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
-			return syscall.EMFILE
+			return nil, nil, syscall.EMFILE
 		}
 	}
 
 	// Install the entry.
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.setAll(fd, file, fileVFS2, flags)
-	return nil
+
+	df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
+	return df, dfVFS2, nil
 }
 
 // SetFlags sets the flags for the given file descriptor.
 //
 // An error is returned if the FD is invalid or not in use.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -463,14 +493,14 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
 	}
 
 	// Update the flags.
-	f.set(fd, file, flags)
+	f.set(ctx, fd, file, flags)
 	return nil
 }
 
 // SetFlagsVFS2 sets the flags for the given file descriptor.
 //
 // An error is returned if the FD is invalid or not in use.
-func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -486,7 +516,7 @@ func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
 	}
 
 	// Update the flags.
-	f.setVFS2(fd, file, flags)
+	f.setVFS2(ctx, fd, file, flags)
 	return nil
 }
 
@@ -552,30 +582,6 @@ func (f *FDTable) GetFDs(ctx context.Context) []int32 {
 	return fds
 }
 
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefs(ctx context.Context) []*fs.File {
-	files := make([]*fs.File, 0, f.Size())
-	f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-		file.IncRef() // Acquire a reference for caller.
-		files = append(files, file)
-	})
-	return files
-}
-
-// GetRefsVFS2 returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription {
-	files := make([]*vfs.FileDescription, 0, f.Size())
-	f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
-		file.IncRef() // Acquire a reference for caller.
-		files = append(files, file)
-	})
-	return files
-}
-
 // Fork returns an independent FDTable.
 func (f *FDTable) Fork(ctx context.Context) *FDTable {
 	clone := f.k.NewFDTable()
@@ -583,11 +589,8 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
 	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		// The set function here will acquire an appropriate table
 		// reference for the clone. We don't need anything else.
-		switch {
-		case file != nil:
-			clone.set(fd, file, flags)
-		case fileVFS2 != nil:
-			clone.setVFS2(fd, fileVFS2, flags)
+		if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
+			panic("VFS1 or VFS2 files set")
 		}
 	})
 	return clone
@@ -596,13 +599,12 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
 
 // Remove removes an FD from the table and returns the removed file, if any.
 //
 // N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
 	if fd < 0 {
 		return nil, nil
 	}
 
 	f.mu.Lock()
-	defer f.mu.Unlock()
 
 	// Update current available position.
 	if fd < f.next {
@@ -618,24 +620,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
 	case orig2 != nil:
 		orig2.IncRef()
 	}
+
 	if orig != nil || orig2 != nil {
-		f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+		orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
 	}
+	f.mu.Unlock()
+
+	if orig != nil {
+		f.drop(ctx, orig)
+	}
+	if orig2 != nil {
+		f.dropVFS2(ctx, orig2)
+	}
+
 	return orig, orig2
 }
 
 // RemoveIf removes all FDs where cond is true.
 func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
-	f.mu.Lock()
-	defer f.mu.Unlock()
+	// TODO(gvisor.dev/issue/1624): Remove fs.File slice.
+	var files []*fs.File
+	var filesVFS2 []*vfs.FileDescription
 
+	f.mu.Lock()
 	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		if cond(file, fileVFS2, flags) {
-			f.set(fd, nil, FDFlags{}) // Clear from table.
+			df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
+			if df != nil {
+				files = append(files, df)
+			}
+			if dfVFS2 != nil {
+				filesVFS2 = append(filesVFS2, dfVFS2)
+			}
 
 			// Update current available position.
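 			// (f.next is a search hint: scans for a free slot begin
 			// at f.next, so it is lowered whenever a smaller FD is
 			// freed.)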
 			if fd < f.next {
 				f.next = fd
 			}
 		}
 	})
+	f.mu.Unlock()
+
+	for _, file := range files {
+		f.drop(ctx, file)
+	}
+
+	for _, file := range filesVFS2 {
+		f.dropVFS2(ctx, file)
+	}
 }
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index e3f30ba2a..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
 	}
 
 	i := int32(2)
-	fdTable.Remove(i)
+	fdTable.Remove(ctx, i)
 	if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
 		t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
 	}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
 		t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
 	} else {
 		for _, fd := range fds {
-			fdTable.Remove(fd)
+			fdTable.Remove(ctx, fd)
 		}
 	}
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
 		t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
 	}
 
-	ref, _ := fdTable.Remove(1)
+	ref, _ := fdTable.Remove(ctx, 1)
 	if ref == nil {
 		t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
 	}
 	ref.DecRef(ctx)
 
-	if ref, _ := fdTable.Remove(1); ref != nil {
+	if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
 		t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
 	}
 })
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 7fd97dc53..da79e6627 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
@@ -31,6 +32,8 @@ type descriptorTable struct {
 }
 
 // init initializes the table.
+//
+// TODO(gvisor.dev/issue/1486): Enable leak check for FDTable.
 func (f *FDTable) init() {
 	var slice []unsafe.Pointer // Empty slice.
 	atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
@@ -76,33 +79,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
 	return d.file, d.fileVFS2, d.flags, true
 }
 
-// set sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// CurrentMaxFDs returns the number of file descriptors that may be stored in f
+// without reallocation.
+func (f *FDTable) CurrentMaxFDs() int {
+	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+	return len(slice)
+}
+
+// set sets an entry for VFS1; see setAll().
 //
 // Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
-	f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File {
+	dropFile, _ := f.setAll(ctx, fd, file, nil, flags)
+	return dropFile
 }
 
-// setVFS2 sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setVFS2 sets an entry for VFS2; see setAll().
 //
 // Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
-	f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription {
+	_, dropFile := f.setAll(ctx, fd, nil, file, flags)
+	return dropFile
+}
 
-// setAll sets an entry.
-// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// setAll sets the file description referred to by fd to file/fileVFS2. If +// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces +// an existing file description, it returns it with the FDTable's reference +// transferred to the caller, which must call f.drop/dropVFS2() on the returned +// file after unlocking f.mu. // // Precondition: mu must be held. -func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { +func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) { if file != nil && fileVFS2 != nil { panic("VFS1 and VFS2 files set") } @@ -145,25 +152,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, } } - // Drop the table reference. + // Adjust used. + switch { + case orig == nil && desc != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && desc == nil: + atomic.AddInt32(&f.used, -1) + } + if orig != nil { switch { case orig.file != nil: if desc == nil || desc.file != orig.file { - f.drop(orig.file) + return orig.file, nil } case orig.fileVFS2 != nil: if desc == nil || desc.fileVFS2 != orig.fileVFS2 { - f.dropVFS2(orig.fileVFS2) + return nil, orig.fileVFS2 } } } - - // Adjust used. - switch { - case orig == nil && desc != nil: - atomic.AddInt32(&f.used, 1) - case orig != nil && desc == nil: - atomic.AddInt32(&f.used, -1) - } + return nil, nil } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 8f2d36d5a..d46d1e1c1 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -18,7 +18,6 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -30,7 +29,7 @@ import ( // // +stateify savable type FSContext struct { - refs.AtomicRefCount + FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` @@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } @@ -77,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } -// destroy is the destructor for an FSContext. +// DecRef implements RefCounter.DecRef. // -// This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propagated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propagated. +// When f reaches zero references, DecRef will be called on both root and cwd +// Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy(ctx context.Context) { - // Hold f.mu so that we don't race with RootDirectory() and - // WorkingDirectory(). 
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	if VFS2Enabled {
-		f.rootVFS2.DecRef(ctx)
-		f.rootVFS2 = vfs.VirtualDentry{}
-		f.cwdVFS2.DecRef(ctx)
-		f.cwdVFS2 = vfs.VirtualDentry{}
-	} else {
-		f.root.DecRef(ctx)
-		f.root = nil
-		f.cwd.DecRef(ctx)
-		f.cwd = nil
-	}
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
 func (f *FSContext) DecRef(ctx context.Context) {
-	f.DecRefWithDestructor(ctx, f.destroy)
+	f.FSContextRefs.DecRef(func() {
+		// Hold f.mu so that we don't race with RootDirectory() and
+		// WorkingDirectory().
+		f.mu.Lock()
+		defer f.mu.Unlock()
+
+		if VFS2Enabled {
+			f.rootVFS2.DecRef(ctx)
+			f.rootVFS2 = vfs.VirtualDentry{}
+			f.cwdVFS2.DecRef(ctx)
+			f.cwdVFS2 = vfs.VirtualDentry{}
+		} else {
+			f.root.DecRef(ctx)
+			f.root = nil
+			f.cwd.DecRef(ctx)
+			f.cwd = nil
+		}
+	})
 }
 
 // Fork forks this FSContext.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
 	if VFS2Enabled {
+		if !f.cwdVFS2.Ok() {
+			panic("FSContext.Fork() called after destroy")
+		}
 		f.cwdVFS2.IncRef()
 		f.rootVFS2.IncRef()
 	} else {
+		if f.cwd == nil {
+			panic("FSContext.Fork() called after destroy")
+		}
 		f.cwd.IncRef()
 		f.root.IncRef()
 	}
@@ -140,8 +141,8 @@ func (f *FSContext) Fork() *FSContext {
 
 // WorkingDirectory returns the current working directory.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -152,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent {
 
 // WorkingDirectoryVFS2 returns the current working directory.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
 	if d == nil {
 		panic("FSContext.SetWorkingDirectory called with nil dirent")
@@ -187,11 +188,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
 // SetWorkingDirectoryVFS2 sets the current working directory.
 // This will take an extra reference on the VirtualDentry.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	if !f.cwdVFS2.Ok() {
+		panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v) called after destroy", d))
+	}
+
 	old := f.cwdVFS2
 	f.cwdVFS2 = d
 	d.IncRef()
@@ -200,8 +205,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe
 
 // RootDirectory returns the current filesystem root.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -213,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -226,7 +231,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") @@ -247,7 +252,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 80a070d7e..3f34ee0db 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -24,6 +25,8 @@ import ( // // +stateify savable type IPCNamespace struct { + IPCNamespaceRefs + // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace @@ -33,11 +36,13 @@ type IPCNamespace struct { // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { - return &IPCNamespace{ + ns := &IPCNamespace{ userNS: userNS, semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } + ns.EnableLeakCheck() + return ns } // SemaphoreRegistry returns the semaphore set registry for this namespace. @@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// DecRef implements refs_vfs2.RefCounter.DecRef. +func (i *IPCNamespace) DecRef(ctx context.Context) { + i.IPCNamespaceRefs.DecRef(func() { + i.shms.Release(ctx) + }) +} + // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go new file mode 100644 index 000000000..4fcdfc541 --- /dev/null +++ b/pkg/sentry/kernel/kcov.go @@ -0,0 +1,338 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"io"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
+// area. On Linux, the maximum is INT_MAX / 8.
+const kcovAreaSizeMax = 10 * 1024 * 1024
+
+// Kcov provides kernel coverage data to userspace through a memory-mapped
+// region, as kcov does in Linux.
+//
+// To give the illusion that the data is always up to date, we update the shared
+// memory every time before we return to userspace.
+type Kcov struct {
+	// mfp provides application memory. It is immutable after creation.
+	mfp pgalloc.MemoryFileProvider
+
+	// mu protects all of the fields below.
+	mu sync.RWMutex
+
+	// mode is the current kcov mode.
+	mode uint8
+
+	// size is the size of the mapping through which the kernel conveys coverage
+	// information to userspace.
+	size uint64
+
+	// owningTask is the task that currently owns coverage data on the system. The
+	// interface for kcov essentially requires that coverage is only going to a
+	// single task. Note that kcov should only generate coverage data for the
+	// owning task, but we currently generate global coverage.
+	owningTask *Task
+
+	// count is a locally cached version of the first uint64 in the kcov data,
+	// which is the number of subsequent entries representing PCs.
+	//
+	// It is used with Kcov.countBlock() to copy the first element of the actual
+	// data in and out efficiently, avoiding boilerplate and preventing
+	// accidental garbage escapes by temporary counts.
+	count uint64
+
+	mappable *mm.SpecialMappable
+}
+
+// NewKcov creates and returns a Kcov instance.
+func (k *Kernel) NewKcov() *Kcov {
+	return &Kcov{
+		mfp: k,
+	}
+}
+
+var coveragePool = sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 0)
+	},
+}
+
+// TaskWork implements TaskWorker.TaskWork.
+func (kcov *Kcov) TaskWork(t *Task) {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_TRACE_PC {
+		return
+	}
+
+	rw := &kcovReadWriter{
+		mf: kcov.mfp.MemoryFile(),
+		fr: kcov.mappable.FileRange(),
+	}
+
+	// Read in the PC count.
+	if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil {
+		panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err))
+	}
+
+	rw.off = 8 * (1 + kcov.count)
+	n := coverage.ConsumeCoverageData(&kcovIOWriter{rw})
+
+	// Update the PC count based on the number of entries written. Note that if
+	// we reached the end of the kcov area, we may not have written everything
+	// in output.
+	kcov.count += uint64(n / 8)
+	rw.off = 0
+	if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil {
+		panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err))
+	}
+
+	// Re-register for future work.
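+	// (TaskWork runs once per return to userspace, which is what keeps the
+	// memory-mapped region looking up to date from the tracing task's
+	// perspective.)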
+	t.RegisterWork(kcov)
+}
+
+// InitTrace performs the KCOV_INIT_TRACE ioctl.
+func (kcov *Kcov) InitTrace(size uint64) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_DISABLED {
+		return syserror.EBUSY
+	}
+
+	// To simplify all the logic around mapping, we require that the length of the
+	// shared region is a multiple of the system page size.
+	if (8*size)&(usermem.PageSize-1) != 0 {
+		return syserror.EINVAL
+	}
+
+	// We need space for at least two uint64s to hold current position and a
+	// single PC.
+	if size < 2 || size > kcovAreaSizeMax {
+		return syserror.EINVAL
+	}
+
+	kcov.size = size
+	kcov.mode = linux.KCOV_MODE_INIT
+	return nil
+}
+
+// EnableTrace performs the KCOV_ENABLE_TRACE ioctl.
+func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error {
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+	}
+
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	// KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
+	if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
+		return syserror.EINVAL
+	}
+
+	switch traceKind {
+	case linux.KCOV_TRACE_PC:
+		kcov.mode = linux.KCOV_MODE_TRACE_PC
+	case linux.KCOV_TRACE_CMP:
+		// We do not support KCOV_MODE_TRACE_CMP.
+		return syserror.ENOTSUP
+	default:
+		return syserror.EINVAL
+	}
+
+	if kcov.owningTask != nil && kcov.owningTask != t {
+		return syserror.EBUSY
+	}
+
+	kcov.owningTask = t
+	t.SetKcov(kcov)
+	t.RegisterWork(kcov)
+
+	// Clear existing coverage data; the task expects to read only coverage data
+	// from the time it is activated.
+	coverage.ClearCoverageData()
+	return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.DisableTrace() cannot be used outside of a task goroutine")
+	}
+
+	if t != kcov.owningTask {
+		return syserror.EINVAL
+	}
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable.DecRef(ctx)
+		kcov.mappable = nil
+	}
+	return nil
+}
+
+// Clear resets the mode and clears the owning task and memory mapping for kcov.
+// It is called when the fd corresponding to kcov is closed. Note that the mode
+// needs to be set so that the next call to kcov.TaskWork() will exit early.
+func (kcov *Kcov) Clear(ctx context.Context) {
+	kcov.mu.Lock()
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable.DecRef(ctx)
+		kcov.mappable = nil
+	}
+	kcov.mu.Unlock()
+}
+
+// OnTaskExit is called when the owning task exits. It is similar to
+// kcov.Clear(), except the memory mapping is not cleared, so that the same
+// mapping can be used in the future if kcov is enabled again by another task.
+func (kcov *Kcov) OnTaskExit() {
+	kcov.mu.Lock()
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	kcov.mu.Unlock()
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_INIT {
+		return syserror.EINVAL
+	}
+
+	if kcov.mappable == nil {
+		// Set up the kcov area.
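+		// (kcov.size counts uint64 entries, so the backing allocation
+		// is 8*size bytes; InitTrace has already verified that this is
+		// a multiple of the page size.)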
+		fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+		if err != nil {
+			return err
+		}
+
+		// Get the thread id for the mmap name.
+		t := TaskFromContext(ctx)
+		if t == nil {
+			panic("TaskFromContext returned nil")
+		}
+		// For convenience, a special mappable is used here. Note that these mappings
+		// will look different under /proc/[pid]/maps than they do on Linux.
+		kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+	}
+	kcov.mappable.IncRef()
+	opts.Mappable = kcov.mappable
+	opts.MappingIdentity = kcov.mappable
+	return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+	off uint64
+	mf  *pgalloc.MemoryFile
+	fr  memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the read to the kcov range and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if rend := start + dsts.NumBytes(); rend < end {
+		end = rend
+	}
+
+	// Get internal mappings.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy from internal mappings.
+	n, err := safemem.CopySeq(dsts, bs)
+	rw.off += n
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the write to the kcov area and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if wend := start + srcs.NumBytes(); wend < end {
+		end = wend
+	}
+
+	// Get internal mapping.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy to internal mapping.
+	n, err := safemem.CopySeq(bs, srcs)
+	rw.off += n
+	return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+	rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+	bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+	n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+	return int(n), err
+}
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f8a0266b
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for kcov.count.
+//
+// Like kcov.count, the block returned is protected by kcov.mu.
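+//
+// It exists so that TaskWork can use safemem.ReadFullToBlocks and
+// safemem.WriteFullFromBlocks to copy the leading count word between
+// kcov.count and the shared area without an intermediate buffer.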
+func (kcov *Kcov) countBlock() safemem.BlockSeq { + return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count)))) +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1028d13c6..0eb2bf7bd 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -39,6 +39,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" @@ -220,13 +221,18 @@ type Kernel struct { // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` - // sockets is the list of all network sockets the system. Protected by - // extMu. + // sockets is the list of all network sockets in the system. + // Protected by extMu. + // TODO(gvisor.dev/issue/1624): Only used by VFS1. sockets socketList - // nextSocketEntry is the next entry number to use in sockets. Protected + // socketsVFS2 records all network sockets in the system. Protected by + // extMu. + socketsVFS2 map[*vfs.FileDescription]*SocketRecord + + // nextSocketRecord is the next entry number to use in sockets. Protected // by extMu. - nextSocketEntry uint64 + nextSocketRecord uint64 // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -248,7 +254,7 @@ type Kernel struct { // SpecialOpts contains special kernel options. SpecialOpts - // VFS keeps the filesystem state used across the kernel. + // vfs keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem // hostMount is the Mount used for file descriptors that were imported @@ -335,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("Timekeeper is nil") } if args.Timekeeper.clocks == nil { - return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") @@ -360,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { - return fmt.Errorf("Failed to get maximum CPU number: %v", err) + return fmt.Errorf("failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { @@ -414,6 +420,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("failed to create sockfs mount: %v", err) } k.socketMount = socketMount + + k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) } return nil @@ -507,6 +515,10 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { + if VFS2Enabled { + return nil // Not relevant. + } + // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -533,11 +545,6 @@ func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
- if VFS2Enabled { - return nil - } - ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -556,6 +563,10 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -818,7 +829,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: - return ctx.args.IPCNamespace + ipcns := ctx.args.IPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: @@ -831,14 +844,16 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { if ctx.args.MountNamespaceVFS2 == nil { return nil } - // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. - return ctx.args.MountNamespaceVFS2.Root() + root := ctx.args.MountNamespaceVFS2.Root() + root.IncRef() + return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } - // MountNamespaceVFS2 takes a reference for us. - return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns.IncRef() + return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: @@ -888,19 +903,19 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, opener fsbridge.Lookup fsContext *FSContext mntns *fs.MountNamespace + mntnsVFS2 *vfs.MountNamespace ) if VFS2Enabled { - mntnsVFS2 := args.MountNamespaceVFS2 + mntnsVFS2 = args.MountNamespaceVFS2 if mntnsVFS2 == nil { - // MountNamespaceVFS2 adds a reference to the namespace, which is - // transferred to the new process. + // Add a reference to the namespace, which is transferred to the new process. mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() + mntnsVFS2.IncRef() } // Get the root directory from the MountNamespace. - root := args.MountNamespaceVFS2.Root() - // The call to newFSContext below will take a reference on root, so we - // don't need to hold this one. + root := mntnsVFS2.Root() + root.IncRef() defer root.DecRef(ctx) // Grab the working directory. @@ -952,6 +967,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + cu := cleanup.Make(func() { + tg.Release(ctx) + }) + defer cu.Clean() // Check which file to start from. switch { @@ -1008,16 +1027,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, - MountNamespaceVFS2: args.MountNamespaceVFS2, + MountNamespaceVFS2: mntnsVFS2, ContainerID: args.ContainerID, } - t, err := k.tasks.NewTask(config) + t, err := k.tasks.NewTask(ctx, config) if err != nil { return nil, 0, err } t.traceExecEvent(tc) // Simulate exec for tracing. // Success. + cu.Release() tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg @@ -1067,8 +1087,9 @@ func (k *Kernel) Start() error { // pauseTimeLocked pauses all Timers and Timekeeper updates. // -// Preconditions: Any task goroutines running in k must be stopped. k.extMu -// must be locked. 
+// Preconditions: +// * Any task goroutines running in k must be stopped. +// * k.extMu must be locked. func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). @@ -1111,8 +1132,9 @@ func (k *Kernel) pauseTimeLocked(ctx context.Context) { // pauseTimeLocked has not been previously called, resumeTimeLocked has no // effect. // -// Preconditions: Any task goroutines running in k must be stopped. k.extMu -// must be locked. +// Preconditions: +// * Any task goroutines running in k must be stopped. +// * k.extMu must be locked. func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() @@ -1360,8 +1382,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } -// RootIPCNamespace returns the root IPCNamespace. +// RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { + k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } @@ -1506,20 +1529,27 @@ func (k *Kernel) SupervisorContext() context.Context { } } -// SocketEntry represents a socket recorded in Kernel.sockets. It implements +// SocketRecord represents a socket recorded in Kernel.socketsVFS2. +// +// +stateify savable +type SocketRecord struct { + k *Kernel + Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1. + SockVFS2 *vfs.FileDescription // Only used by VFS2. + ID uint64 // Socket table entry number. +} + +// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // // +stateify savable -type SocketEntry struct { +type SocketRecordVFS1 struct { socketEntry - k *Kernel - Sock *refs.WeakRef - SockVFS2 *vfs.FileDescription - ID uint64 // Socket table entry number. + SocketRecord } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (s *SocketEntry) WeakRefGone(context.Context) { +func (s *SocketRecordVFS1) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() @@ -1530,9 +1560,14 @@ func (s *SocketEntry) WeakRefGone(context.Context) { // Precondition: Caller must hold a reference to sock. func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Lock() - id := k.nextSocketEntry - k.nextSocketEntry++ - s := &SocketEntry{k: k, ID: id} + id := k.nextSocketRecord + k.nextSocketRecord++ + s := &SocketRecordVFS1{ + SocketRecord: SocketRecord{ + k: k, + ID: id, + }, + } s.Sock = refs.NewWeakRef(sock, s) k.sockets.PushBack(s) k.extMu.Unlock() @@ -1544,29 +1579,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) { // Precondition: Caller must hold a reference to sock. // // Note that the socket table will not hold a reference on the -// vfs.FileDescription, because we do not support weak refs on VFS2 files. +// vfs.FileDescription. func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { k.extMu.Lock() - id := k.nextSocketEntry - k.nextSocketEntry++ - s := &SocketEntry{ + if _, ok := k.socketsVFS2[sock]; ok { + panic(fmt.Sprintf("Socket %p added twice", sock)) + } + id := k.nextSocketRecord + k.nextSocketRecord++ + s := &SocketRecord{ k: k, ID: id, SockVFS2: sock, } - k.sockets.PushBack(s) + k.socketsVFS2[sock] = s + k.extMu.Unlock() +} + +// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table. 
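+//
+// This is the explicit VFS2 counterpart of SocketRecordVFS1.WeakRefGone:
+// since the table holds no reference on the vfs.FileDescription, socket
+// implementations are expected to call DeleteSocketVFS2 themselves (e.g. on
+// release) rather than relying on a weak-reference callback.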
+func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + delete(k.socketsVFS2, sock) k.extMu.Unlock() } // ListSockets returns a snapshot of all sockets. // -// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef() // to get a reference on a socket in the table. -func (k *Kernel) ListSockets() []*SocketEntry { +func (k *Kernel) ListSockets() []*SocketRecord { k.extMu.Lock() - var socks []*SocketEntry - for s := k.sockets.Front(); s != nil; s = s.Next() { - socks = append(socks, s) + var socks []*SocketRecord + if VFS2Enabled { + for _, s := range k.socketsVFS2 { + socks = append(socks, s) + } + } else { + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, &s.SocketRecord) + } } k.extMu.Unlock() return socks @@ -1594,7 +1645,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: - return ctx.k.rootIPCNamespace + ipcns := ctx.k.rootIPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) @@ -1607,16 +1660,16 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { if ctx.k.globalInit == nil { return vfs.VirtualDentry{} } - mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() - defer mntns.DecRef(ctx) - // Root() takes a reference on the root dirent for us. - return mntns.Root() + root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root() + root.IncRef() + return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } - // MountNamespaceVFS2() takes a reference for us. - return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns.IncRef() + return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: @@ -1697,3 +1750,20 @@ func (k *Kernel) ShmMount() *vfs.Mount { func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } + +// Release releases resources owned by k. +// +// Precondition: This should only be called after the kernel is fully +// initialized, e.g. after k.Start() has been called. +func (k *Kernel) Release() { + ctx := k.SupervisorContext() + if VFS2Enabled { + k.hostMount.DecRef(ctx) + k.pipeMount.DecRef(ctx) + k.shmMount.DecRef(ctx) + k.socketMount.DecRef(ctx) + k.vfs.Release(ctx) + } + k.timekeeper.Destroy() + k.vdso.Release(ctx) +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 449643118..99134e634 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/amutex", "//pkg/buffer", "//pkg/context", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 297e8f28f..67beb0ad6 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -17,6 +17,7 @@ package pipe import ( "fmt" + "io" "sync/atomic" "syscall" @@ -200,22 +201,22 @@ type readOps struct { // // Precondition: this pipe must have readers. func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { - // Don't block for a zero-length read even if the pipe is empty. 
- if ops.left() == 0 { - return 0, nil - } - p.mu.Lock() defer p.mu.Unlock() return p.readLocked(ctx, ops) } func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { + // Don't block for a zero-length read even if the pipe is empty. + if ops.left() == 0 { + return 0, nil + } + // Is the pipe empty? if p.view.Size() == 0 { if !p.HasWriters() { // There are no writers, return EOF. - return 0, nil + return 0, io.EOF } return 0, syserror.ErrWouldBlock } @@ -388,6 +389,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask { func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() + return p.queuedLocked() +} + +func (p *Pipe) queuedLocked() int64 { return p.view.Size() } diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 6d58b682f..f665920cb 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + iocc := primitive.IOCopyContext{ + IO: io, + Ctx: ctx, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v)) return 0, err default: return 0, syscall.ENOTTY diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 28f998e45..1a152142b 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -67,6 +67,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error { + return syserror.ESPIPE +} + // Open opens the pipe represented by vp. func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() @@ -232,8 +237,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PipeSize implements fcntl(F_GETPIPE_SZ). func (fd *VFSPipeFD) PipeSize() int64 { - // Inline Pipe.FifoSize() rather than calling it with nil Context and - // fs.File and ignoring the returned error (which is always nil). + // Inline Pipe.FifoSize() since we don't have a fs.File. fd.pipe.mu.Lock() defer fd.pipe.mu.Unlock() return fd.pipe.max @@ -244,19 +248,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } -// IOSequence returns a useremm.IOSequence that reads up to count bytes from, -// or writes up to count bytes to, fd. -func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { - return usermem.IOSequence{ +// SpliceToNonPipe performs a splice operation from fd to a non-pipe file. 
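Both splice helpers that follow hold fd.pipe.mu for the entire transfer, which is why the pipe.go hunks above split Pipe.read into read/readLocked and add queuedLocked: the locked variants assume the mutex is held, so lock-holding callers can compose them without re-acquiring it. A self-contained sketch of that lock-split idiom, including the new empty-pipe EOF behavior, using toy types rather than the sentry's:

	package pipe

	import (
		"errors"
		"io"
		"sync"
	)

	var errWouldBlock = errors.New("would block")

	type pipeBuf struct {
		mu      sync.Mutex
		data    []byte
		writers int
	}

	// Read acquires the lock and delegates to readLocked.
	func (p *pipeBuf) Read(dst []byte) (int, error) {
		p.mu.Lock()
		defer p.mu.Unlock()
		return p.readLocked(dst)
	}

	// readLocked requires p.mu to be held; lock-holding callers such as a
	// splice path call it directly.
	func (p *pipeBuf) readLocked(dst []byte) (int, error) {
		if len(dst) == 0 {
			return 0, nil // Zero-length reads never block.
		}
		if len(p.data) == 0 {
			if p.writers == 0 {
				return 0, io.EOF // No writers left: signal EOF explicitly.
			}
			return 0, errWouldBlock
		}
		n := copy(dst, p.data)
		p.data = p.data[n:]
		return n, nil
	}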
+func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+
+	// Cap the sequence at the number of bytes actually available.
+	v := fd.pipe.queuedLocked()
+	if v < count {
+		count = v
+	}
+	src := usermem.IOSequence{
 		IO:    fd,
 		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
 	}
+
+	var (
+		n   int64
+		err error
+	)
+	if off == -1 {
+		n, err = out.Write(ctx, src, vfs.WriteOptions{})
+	} else {
+		n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
+	}
+	if n > 0 {
+		fd.pipe.view.TrimFront(n)
+	}
+	return n, err
+}
+
+// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+
+	dst := usermem.IOSequence{
+		IO:    fd,
+		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+	}
+
+	if off == -1 {
+		return in.Read(ctx, dst, vfs.ReadOptions{})
+	}
+	return in.PRead(ctx, dst, off, vfs.ReadOptions{})
 }

-// CopyIn implements usermem.IO.CopyIn.
+// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
 func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
 	origCount := int64(len(dst))
-	n, err := fd.pipe.read(ctx, readOps{
+	n, err := fd.pipe.readLocked(ctx, readOps{
 		left: func() int64 {
 			return int64(len(dst))
 		},
@@ -265,7 +307,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
 		},
 		read: func(view *buffer.View) (int64, error) {
 			n, err := view.ReadAt(dst, 0)
-			view.TrimFront(int64(n))
 			return int64(n), err
 		},
 	})
@@ -281,7 +322,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
 // CopyOut implements usermem.IO.CopyOut.
 func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
 	origCount := int64(len(src))
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return int64(len(src))
 		},
@@ -305,7 +346,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
 // ZeroOut implements usermem.IO.ZeroOut.
 func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
 	origCount := toZero
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return toZero
 		},
@@ -326,14 +367,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
 	return n, err
 }

-// CopyInTo implements usermem.IO.CopyInTo.
+// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
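The caller-trims contract noted here is what makes SpliceToNonPipe above safe: the read callbacks no longer TrimFront the view themselves, so buffered bytes are only consumed after the downstream write has accepted them, and a short or failed write loses nothing. A toy peek-then-consume sketch of the same idea (hypothetical types, not the sentry's buffer.View):

	package splice

	import "io"

	// peekBuf stands in for the pipe's buffer.View.
	type peekBuf struct{ data []byte }

	// peek copies bytes out without consuming them.
	func (b *peekBuf) peek(dst []byte) int { return copy(dst, b.data) }

	// trimFront consumes n bytes once they are durably written elsewhere.
	func (b *peekBuf) trimFront(n int) { b.data = b.data[n:] }

	// spliceOnce writes buffered bytes to w, consuming only what w accepted.
	func spliceOnce(b *peekBuf, w io.Writer, scratch []byte) (int, error) {
		n := b.peek(scratch)
		written, err := w.Write(scratch[:n])
		if written > 0 {
			b.trimFront(written) // A failed write leaves the data queued.
		}
		return written, err
	}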
func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { count := ars.NumBytes() if count == 0 { return 0, nil } origCount := count - n, err := fd.pipe.read(ctx, readOps{ + n, err := fd.pipe.readLocked(ctx, readOps{ left: func() int64 { return count }, @@ -342,7 +384,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst }, read: func(view *buffer.View) (int64, error) { n, err := view.ReadToSafememWriter(dst, uint64(count)) - view.TrimFront(int64(n)) return int64(n), err }, }) @@ -362,7 +403,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, return 0, nil } origCount := count - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return count }, diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 619b0cb7c..1145faf13 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" @@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool { // beginPtraceStopLocked does not signal t's tracer or wake it if it is // waiting. // -// Preconditions: The TaskSet mutex must be locked. The caller must be running -// on the task goroutine. +// Preconditions: +// * The TaskSet mutex must be locked. +// * The caller must be running on the task goroutine. func (t *Task) beginPtraceStopLocked() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() @@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) { // ptraceStop, temporarily preventing it from being removed by a concurrent // Task.Kill, and returns true. Otherwise it returns false. // -// Preconditions: The TaskSet mutex must be locked. The caller must be running -// on the task goroutine of t's tracer. +// Preconditions: +// * The TaskSet mutex must be locked. +// * The caller must be running on the task goroutine of t's tracer. func (t *Task) ptraceFreeze() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() @@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() { t.ptraceUnfreezeLocked() } -// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be -// locked. +// Preconditions: +// * t must be in a frozen ptraceStop. +// * t's signal mutex must be locked. func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. @@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() { // ptraceSignalLocked is called after signal dequeueing to check if t should // enter ptrace signal-delivery-stop. // -// Preconditions: The signal mutex must be locked. The caller must be running -// on the task goroutine. +// Preconditions: +// * The signal mutex must be locked. +// * The caller must be running on the task goroutine. func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false @@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error { return nil } -// Preconditions: The TaskSet mutex must be locked for writing. t must have a -// tracer. +// Preconditions: +// * The TaskSet mutex must be locked for writing. +// * t must have a tracer. 
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { const valid = uintptr(linux.PTRACE_O_EXITKILL | linux.PTRACE_O_TRACESYSGOOD | @@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // at the address specified by the data parameter, and the return value // is the error flag." - ptrace(2) word := t.Arch().Native(0) - if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ - IgnorePermissions: true, - }); err != nil { + if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { return err } - _, err := t.CopyOut(data, word) + _, err := word.CopyOut(t, data) return err case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: - _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ - IgnorePermissions: true, - }) + word := t.Arch().Native(uintptr(data)) + _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr) return err case linux.PTRACE_GETREGSET: @@ -1073,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if target.ptraceSiginfo == nil { return syserror.EINVAL } - _, err := t.CopyOut(data, target.ptraceSiginfo) + _, err := target.ptraceSiginfo.CopyOut(t, data) return err case linux.PTRACE_SETSIGINFO: var info arch.SignalInfo - if _, err := t.CopyIn(data, &info); err != nil { + if _, err := info.CopyIn(t, data); err != nil { return err } t.tg.pidns.owner.mu.RLock() @@ -1093,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if addr != linux.SignalSetSize { return syserror.EINVAL } - _, err := t.CopyOut(data, target.SignalMask()) + mask := target.SignalMask() + _, err := mask.CopyOut(t, data) return err case linux.PTRACE_SETSIGMASK: @@ -1101,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return syserror.EINVAL } var mask linux.SignalSet - if _, err := t.CopyIn(data, &mask); err != nil { + if _, err := mask.CopyIn(t, data); err != nil { return err } // The target's task goroutine is stopped, so this is safe: @@ -1116,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() - _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to have no users anywhere. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index cef1276ec..609ad3941 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro if err != nil { return err } - _, err = t.CopyOut(data, n) + _, err = n.CopyOut(t, data) return err case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 18416643b..2a9023fdf 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr { // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with // t's CPU number. // -// Preconditions: t.RSeqAvailable() == true. The caller must be running on the -// task goroutine. t's AddressSpace must be active. 
+// Preconditions:
+// * t.RSeqAvailable() == true.
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
 	t.oldRSeqCPUAddr = addr
@@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
 	return nil
 }

-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqUpdateCPU() error {
 	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
 		t.rseqCPU = -1
@@ -209,8 +212,9 @@
 	return oerr
 }

-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) oldRSeqCopyOutCPU() error {
 	if t.oldRSeqCPUAddr == 0 {
 		return nil
@@ -222,8 +226,9 @@
 	return err
 }

-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqCopyOutCPU() error {
 	if t.rseqAddr == 0 {
 		return nil
@@ -240,8 +245,9 @@
 	return err
 }

-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqClearCPU() error {
 	buf := t.CopyScratchBuffer(8)
 	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
@@ -269,8 +275,9 @@
 //
 // See kernel/rseq.c:rseq_ip_fixup for reference.
 //
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqAddrInterrupt() {
 	if t.rseqAddr == 0 {
 		return
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index c38c5a40c..387edfa91 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -18,7 +18,6 @@ import (
 	"syscall"

 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -27,25 +26,18 @@ import (

 const maxSyscallFilterInstructions = 1 << 15

-// seccompData is equivalent to struct seccomp_data, which contains the data
-// passed to seccomp-bpf filters.
-type seccompData struct {
-	// nr is the system call number.
-	nr int32
-
-	// arch is an AUDIT_ARCH_* value indicating the system call convention.
-	arch uint32
-
-	// instructionPointer is the value of the instruction pointer at the time
-	// of the system call.
-	instructionPointer uint64
-
-	// args contains the first 6 system call arguments.
-	args [6]uint64
-}
-
-func (d *seccompData) asBPFInput() bpf.Input {
-	return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
+// dataAsBPFInput returns d serialized as a BPF input. The result is only
+// valid on the current task goroutine, since it aliases the task's copy
+// scratch buffer.
+//
+// Note: this is called for every syscall, which is a very hot path.
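The replacement below marshals the go-marshal linux.SeccompData into the task's reusable scratch buffer instead of allocating a fresh slice per syscall, at the cost that the returned input is invalidated by the next scratch-buffer use on the same goroutine. A simplified sketch of that reusable-buffer idea (toy types; the real method is Task.CopyScratchBuffer):

	package kernel

	// task stands in for Task; the buffer grows to a steady-state size and
	// is then reused, so the per-syscall hot path stops allocating.
	type task struct {
		scratch []byte
	}

	// copyScratchBuffer returns a buffer of at least size bytes. The result
	// is only valid until the next call on the same task goroutine.
	func (t *task) copyScratchBuffer(size int) []byte {
		if cap(t.scratch) < size {
			t.scratch = make([]byte, size)
		}
		return t.scratch[:size]
	}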
+func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { + buf := t.CopyScratchBuffer(d.SizeBytes()) + d.MarshalUnsafe(buf) + return bpf.InputBytes{ + Data: buf, + // Go-marshal always uses the native byte order. + Order: usermem.ByteOrder, + } } func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { @@ -112,20 +104,20 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u } func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { - data := seccompData{ - nr: sysno, - arch: t.tc.st.AuditNumber, - instructionPointer: uint64(ip), + data := linux.SeccompData{ + Nr: sysno, + Arch: t.tc.st.AuditNumber, + InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so // we can't do any slicing tricks or even use copy/append here. for i, arg := range args { - if i >= len(data.args) { + if i >= len(data.Args) { break } - data.args[i] = arg.Uint64() + data.Args[i] = arg.Uint64() } - input := data.asBPFInput() + input := dataAsBPFInput(t, &data) ret := uint32(linux.SECCOMP_RET_ALLOW) f := t.syscallFilters.Load() diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 5c4c622c2..df5c8421b 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,8 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,7 +30,7 @@ type ProcessGroupID ThreadID // // +stateify savable type Session struct { - refs refs.AtomicRefCount + SessionRefs // leader is the originator of the Session. // @@ -62,16 +60,11 @@ type Session struct { sessionEntry } -// incRef grabs a reference. -func (s *Session) incRef() { - s.refs.IncRef() -} - -// decRef drops a reference. +// DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. -func (s *Session) decRef() { - s.refs.DecRefWithDestructor(nil, func(context.Context) { +func (s *Session) DecRef() { + s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -88,7 +81,7 @@ func (s *Session) decRef() { // // +stateify savable type ProcessGroup struct { - refs refs.AtomicRefCount // not exported. + refs ProcessGroupRefs // originator is the originator of the group. // @@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(nil, func(context.Context) { + pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. @@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // Remove the list of process groups. pg.session.processGroups.Remove(pg) - pg.session.decRef() + pg.session.DecRef() }) if alive { pg.handleOrphan() @@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.refs.EnableLeakCheck("kernel.Session") + s.EnableLeakCheck() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() // Tie them and return the result. 
s.processGroups.PushBack(pg) @@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // // We manually adjust the ancestors if the parent is in the same // session. - tg.processGroup.session.incRef() + tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index c211fc8d0..f8a382fd8 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,12 +1,25 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "shm_refs", + out = "shm_refs.go", + package = "shm", + prefix = "Shm", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Shm", + }, +) + go_library( name = "shm", srcs = [ "device.go", "shm.go", + "shm_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -14,6 +27,7 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refs_vfs2", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 13ec7afe0..ebbebf46b 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -39,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck("kernel.Shm") + shm.EnableLeakCheck() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { @@ -322,9 +321,32 @@ func (r *Registry) remove(s *Shm) { r.totalPages -= s.effectiveSize / usermem.PageSize } +// Release drops the self-reference of each active shm segment in the registry. +// It is called when the kernel.IPCNamespace containing r is being destroyed. +func (r *Registry) Release(ctx context.Context) { + // Because Shm.DecRef() may acquire the same locks, collect the segments to + // release first. Note that this should not race with any updates to r, since + // the IPC namespace containing it has no more references. + toRelease := make([]*Shm, 0) + r.mu.Lock() + for _, s := range r.keysToShms { + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + } + r.mu.Unlock() + + for _, s := range toRelease { + r.dissociateKey(s) + s.DecRef(ctx) + } +} + // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform memory. +// Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented; which requires special care when unmapping a @@ -337,14 +359,14 @@ func (r *Registry) remove(s *Shm) { // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment. 
+ // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. - refs.AtomicRefCount + ShmRefs mfp pgalloc.MemoryFileProvider @@ -428,11 +450,14 @@ func (s *Shm) InodeID() uint64 { return uint64(s.ID) } -// DecRef overrides refs.RefCount.DecRef with a destructor. +// DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. func (s *Shm) DecRef(ctx context.Context) { - s.DecRefWithDestructor(ctx, s.destroy) + s.ShmRefs.DecRef(func() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) + }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,11 +667,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy(context.Context) { - s.mfp.MemoryFile().DecRef(s.fr) - s.registry.remove(s) -} - // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See @@ -655,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() - defer s.mu.Unlock() - if !s.pendingDestruction { - s.pendingDestruction = true - // Drop the self-reference so destruction occurs when all - // external references are gone. - // - // N.B. This cannot be the final DecRef, as the caller also - // holds a reference. - s.DecRef(ctx) + if s.pendingDestruction { + s.mu.Unlock() return } + s.pendingDestruction = true + s.mu.Unlock() + + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef(ctx) + return } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 3eb78e91b..76d472292 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index b07e1c1bd..78f718cfe 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -17,7 +17,6 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" @@ -103,8 +102,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } // Copy out the signal info using the specified format. - var buf [128]byte - binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, @@ -113,9 +111,13 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), - }) - n, err := dst.CopyOut(ctx, buf[:]) - return int64(n), err + } + n, err := infoNative.WriteTo(dst.Writer(ctx)) + if err == usermem.ErrEndOfIOSequence { + // Partial copy-out ok. 
+ err = nil + } + return n, err } // Readiness implements waiter.Waitable.Readiness. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 413111faf..332bdb8e8 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string { return fmt.Sprintf("sys_%d", sysno) // Unlikely. } +// LookupNo looks up a syscall number by name. +func (s *SyscallTable) LookupNo(name string) (uintptr, error) { + for i, syscall := range s.Table { + if syscall.Name == name { + return uintptr(i), nil + } + } + return 0, fmt.Errorf("syscall %q not found", name) +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 5aee699e7..037971393 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -574,6 +574,11 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time + + // kcov is the kcov instance providing code coverage owned by this task. + // + // kcov is exclusive to the task goroutine. + kcov *Kcov } func (t *Task) savePtraceTracer() *Task { @@ -651,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} { case CtxUTSNamespace: return t.utsns case CtxIPCNamespace: - return t.ipcns + ipcns := t.IPCNamespace() + ipcns.IncRef() + return ipcns case CtxTask: return t case auth.CtxCredentials: @@ -730,7 +737,6 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() - defer realRoot.DecRef(t) root := t.fsContext.RootDirectoryVFS2() defer root.DecRef(t) return root != realRoot @@ -863,7 +869,6 @@ func (t *Task) MountNamespace() *fs.MountNamespace { func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { t.mu.Lock() defer t.mu.Unlock() - t.mountNamespaceVFS2.IncRef() return t.mountNamespaceVFS2 } @@ -903,3 +908,16 @@ func (t *Task) UID() uint32 { func (t *Task) GID() uint32 { return uint32(t.Credentials().EffectiveKGID) } + +// SetKcov sets the kcov instance associated with t. +func (t *Task) SetKcov(k *Kcov) { + t.kcov = k +} + +// ResetKcov clears the kcov instance associated with t. 
+func (t *Task) ResetKcov() { + if t.kcov != nil { + t.kcov.OnTaskExit() + t.kcov = nil + } +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 9d7a9128f..682080c14 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -203,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) + } else { + ipcns.IncRef() } + cu := cleanup.Make(func() { + ipcns.DecRef(t) + }) + defer cu.Clean() netns := t.NetworkNamespace() if opts.NewNetworkNamespace { @@ -214,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { mntnsVFS2.IncRef() + cu.Add(func() { + mntnsVFS2.DecRef(t) + }) } tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } + cu.Add(func() { + tc.release() + }) // clone() returns 0 in the child. tc.Arch.SetReturn(0) if opts.Stack != 0 { @@ -295,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else { cfg.InheritParent = t } - nt, err := t.tg.pidns.owner.NewTask(cfg) + nt, err := t.tg.pidns.owner.NewTask(t, cfg) + // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does + // the cleanup for us. + cu.Release() if err != nil { - if opts.NewThreadGroup { - tg.release(t) - } return 0, nil, err } @@ -341,12 +354,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt.SetClearTID(opts.ChildTID) } if opts.ChildSetTID { - // Can't use Task.CopyOut, which assumes AddressSpaceActive. - usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + ctid := nt.ThreadID() + ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) } ntid := t.tg.pidns.IDOfTask(nt) if opts.ParentSetTID { - t.CopyOut(opts.ParentTID, ntid) + ntid.CopyOut(t, opts.ParentTID) } kind := ptraceCloneKindClone @@ -509,6 +522,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" + t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 9fa528384..d1136461a 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Stack() *arch.Stack { - return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} + return &arch.Stack{ + Arch: t.Arch(), + IO: t.MemoryManager(), + Bottom: usermem.Addr(t.Arch().Stack()), + } } // LoadTaskImage loads a specified file into a new TaskContext. 
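CreateProcess and Clone above both lean on pkg/cleanup for their error paths: every reference taken is paired with a queued undo, a deferred Clean runs them all if anything fails, and a single Release transfers ownership once the new task is committed. A self-contained sketch of the idiom, using the cleanup API exactly as it appears in this diff (the ref type is hypothetical):

	package main

	import (
		"errors"

		"gvisor.dev/gvisor/pkg/cleanup"
	)

	var errFailed = errors.New("acquire failed")

	// ref is a hypothetical reference-counted resource.
	type ref struct{ refs int }

	func (r *ref) IncRef() { r.refs++ }
	func (r *ref) DecRef() { r.refs-- }

	// acquireBoth takes two references and transfers them to the caller only
	// on success; on failure the deferred Clean drops everything taken so far.
	func acquireBoth(fail bool) (*ref, *ref, error) {
		a := &ref{}
		a.IncRef()
		cu := cleanup.Make(func() { a.DecRef() })
		defer cu.Clean() // No-op once Release has been called.

		b := &ref{}
		b.IncRef()
		cu.Add(func() { b.DecRef() }) // Queue undos as references accumulate.

		if fail {
			return nil, nil, errFailed // cu.Clean releases a and b.
		}

		cu.Release() // Success: ownership moves to the caller.
		return a, b, nil
	}

	func main() {
		acquireBoth(false)
	}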
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 5e4fb3e3a..412d471d3 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -237,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // promoteLocked makes t the leader of its thread group. If t is already the // thread group leader, promoteLocked is a no-op. // -// Preconditions: All other tasks in t's thread group, including the existing -// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must -// be locked for writing. +// Preconditions: +// * All other tasks in t's thread group, including the existing leader (if it +// is not t), have reached TaskExitZombie. +// * The TaskSet mutex must be locked for writing. func (t *Task) promoteLocked() { oldLeader := t.tg.leader if t == oldLeader { diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c165d6cb1..ce7b9641d 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState { t.traceExitEvent() lastExiter := t.exitThreadGroup() + t.ResetKcov() + // If the task has a cleartid, and the thread group wasn't killed by a // signal, handle that before releasing the MM. if t.cleartid != 0 { @@ -246,7 +248,8 @@ func (*runExitMain) execute(t *Task) taskRunState { signaled := t.tg.exiting && t.tg.exitStatus.Signaled() t.tg.signalHandlers.mu.Unlock() if !signaled { - if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + zero := ThreadID(0) + if _, err := zero.CopyOut(t, t.cleartid); err == nil { t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) } // If the CopyOut fails, there's nothing we can do. @@ -277,12 +280,13 @@ func (*runExitMain) execute(t *Task) taskRunState { t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } + t.ipcns.DecRef(t) t.mu.Unlock() // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release(t) + t.tg.Release(t) } // Detach tracees. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 4b535c949..c80391475 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) @@ -87,7 +88,7 @@ func (t *Task) exitRobustList() { return } - next := rl.List + next := primitive.Uint64(rl.List) done := 0 var pendingLockAddr usermem.Addr if rl.ListOpPending != 0 { @@ -99,12 +100,12 @@ func (t *Task) exitRobustList() { // We traverse to the next element of the list before we // actually wake anything. This prevents the race where waking // this futex causes a modification of the list. - thisLockAddr := usermem.Addr(next + rl.FutexOffset) + thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset) // Try to decode the next element in the list before waking the // current futex. But don't check the error until after we've // woken the current futex. Linux does it in this order too - _, nextErr := t.CopyIn(usermem.Addr(next), &next) + _, nextErr := next.CopyIn(t, usermem.Addr(next)) // Wakeup the current futex if it's not pending. 
 			if thisLockAddr != pendingLockAddr {
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index aa3a573c0..8dc3fec90 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -141,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
 	region := trace.StartRegion(t.traceContext, cpuidRegion)
 	expected := arch.CPUIDInstruction[:]
 	found := make([]byte, len(expected))
-	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	_, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
 	if err == nil && bytes.Equal(expected, found) {
 		// Skip the cpuid instruction.
 		t.Arch().CPUIDEmulate(t)
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 09366b60c..52c55d13d 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
 	}
 }

-// Preconditions: The caller must be running on the task goroutine, and leaving
-// a state indicated by a previous call to
-// t.accountTaskGoroutineEnter(state).
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The caller must be leaving a state indicated by a previous call to
+//   t.accountTaskGoroutineEnter(state).
 func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
 	if state != TaskGoroutineRunningApp {
 		// Task is unblocking/continuing.
@@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
 	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
 }

-// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
-// must be locked.
+// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
+// * The TaskSet mutex must be locked.
 func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
 	stats := tg.exitedCPUStats
 	// Account for live tasks.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index d6a2040bc..ebdb83061 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -259,7 +259,11 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
 	// Set up the signal handler. If we have a saved signal mask, the signal
 	// handler should run with the current mask, but sigreturn should restore
 	// the saved one.
-	st := &arch.Stack{t.Arch(), mm, sp}
+	st := &arch.Stack{
+		Arch:   t.Arch(),
+		IO:     mm,
+		Bottom: sp,
+	}
 	mask := t.signalMask
 	if t.haveSavedSignalMask {
 		mask = t.savedSignalMask
@@ -319,8 +323,9 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
 // Sigtimedwait implements the semantics of sigtimedwait(2).
 //
-// Preconditions: The caller must be running on the task goroutine. t.exitState
-// < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
 func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
 	// set is the set of signals we're interested in; invert it to get the set
 	// of signals to block.
@@ -584,8 +589,9 @@ func (t *Task) SignalMask() linux.SignalSet {
 // SetSignalMask sets t's signal mask.
 //
-// Preconditions: SetSignalMask can only be called by the task goroutine.
-// t.exitState < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) SetSignalMask(mask linux.SignalSet) { // By precondition, t prevents t.tg from completing an execve and mutating // t.tg.signalHandlers, so we can skip the TaskSet mutex. @@ -631,7 +637,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's // comment). // -// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +// Preconditions: The caller must be running on the task goroutine. func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { t.savedSignalMask = mask t.haveSavedSignalMask = true diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 64c1e120a..8e28230cc 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -98,14 +99,18 @@ type TaskConfig struct { // NewTask creates a new task defined by cfg. // // NewTask does not start the returned task; the caller must call Task.Start. -func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { +// +// If successful, NewTask transfers references held by cfg to the new task. +// Otherwise, NewTask releases them. +func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.FSContext.DecRef(t) - cfg.FDTable.DecRef(t) + cfg.FSContext.DecRef(ctx) + cfg.FDTable.DecRef(ctx) + cfg.IPCNamespace.DecRef(ctx) if cfg.MountNamespaceVFS2 != nil { - cfg.MountNamespaceVFS2.DecRef(t) + cfg.MountNamespaceVFS2.DecRef(ctx) } return nil, err } diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 296735d32..a35948a5f 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -99,8 +99,9 @@ type TaskStop interface { // beginInternalStop indicates the start of an internal stop that applies to t. // -// Preconditions: The task must not already be in an internal stop (i.e. t.stop -// == nil). The caller must be running on the task goroutine. +// Preconditions: +// * The caller must be running on the task goroutine. +// * The task must not already be in an internal stop (i.e. t.stop == nil). func (t *Task) beginInternalStop(s TaskStop) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() @@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) { t.beginInternalStopLocked(s) } -// Preconditions: The signal mutex must be locked. All preconditions for -// Task.beginInternalStop also apply. +// Preconditions: Same as beginInternalStop, plus: +// * The signal mutex must be locked. func (t *Task) beginInternalStopLocked(s TaskStop) { if t.stop != nil { panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) @@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) { // t.stop, which is why there is no endInternalStop that locks the signal mutex // for you. // -// Preconditions: The signal mutex must be locked. The task must be in an -// internal stop (i.e. t.stop != nil). +// Preconditions: +// * The signal mutex must be locked. +// * The task must be in an internal stop (i.e. t.stop != nil). 
func (t *Task) endInternalStopLocked() { if t.stop == nil { panic("Attempting to leave non-existent internal stop") diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 2dbf86547..0141459e7 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -287,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { // Grab the caller up front, to make sure there's a sensible stack. caller := t.Arch().Native(uintptr(0)) - if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil { t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) @@ -323,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { type runVsyscallAfterPtraceEventSeccomp struct { addr usermem.Addr sysno uintptr - caller interface{} + caller marshal.Marshallable } func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { @@ -346,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) } -func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState { rval, ctrl, err := t.executeSyscall(sysno, args) if ctrl != nil { t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index b02044ad2..ce134bf54 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,17 +44,6 @@ func (t *Task) Deactivate() { } } -// CopyIn copies a fixed-size value or slice of fixed-size values in from the -// task's memory. The copy will fail with syscall.EFAULT if it traverses user -// memory that is unmapped or not readable by the user. -// -// This Task's AddressSpace must be active. -func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { - return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ - AddressSpaceActive: true, - }) -} - // CopyInBytes is a fast version of CopyIn if the caller can serialize the // data without reflection and pass in a byte slice. // @@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { }) } -// CopyOut copies a fixed-size value or slice of fixed-size values out to the -// task's memory. The copy will fail with syscall.EFAULT if it traverses user -// memory that is unmapped or not writeable by the user. -// -// This Task's AddressSpace must be active. 
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
-	return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-}
-
 // CopyOutBytes is a fast version of CopyOut if the caller can serialize the
 // data without reflection and pass in a byte slice.
 //
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
 	var v []string
 	for {
 		argAddr := t.Arch().Native(0)
-		if _, err := t.CopyIn(addr, argAddr); err != nil {
+		if _, err := argAddr.CopyIn(t, addr); err != nil {
 			return v, err
 		}
 		if t.Arch().Value(argAddr) == 0 {
@@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
 // CopyOutIovecs converts src to an array of struct iovecs and copies it to the
 // memory mapped at addr.
 //
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
 	switch t.Arch().Width() {
 	case 8:
@@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
 // combined length of all AddrRanges would otherwise exceed this amount, ranges
 // beyond MAX_RW_COUNT are silently truncated.
 //
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
 	if numIovecs == 0 {
 		return usermem.AddrRangeSeq{}, nil
@@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
 //
 // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
 //
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
 func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
 	if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
 		return usermem.IOSequence{}, syserror.EINVAL
@@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
 		Opts: opts,
 	}, nil
 }
+
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task's memory with custom usermem.IOOpts.
+type copyContext struct {
+	*Task
+	opts usermem.IOOpts
+}
+
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+	return &copyContext{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+	return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+	return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into task memory from an IO context.
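The copyContext wrapper above is the hook that lets go-marshal types target another task's memory with non-default IOOpts; the PTRACE_PEEKDATA/POKEDATA hunks earlier in this change are its first users. A usage sketch as a hypothetical helper on the tracer task (peekWord is not an actual sentry function; it just regroups the calls shown in the ptrace diff):

	package kernel

	import "gvisor.dev/gvisor/pkg/usermem"

	// peekWord reads one native word from target's memory with ptrace's
	// ignore-permissions semantics, then writes it out to the tracer t's
	// memory, using the Task itself as the marshal.CopyContext.
	func peekWord(t, target *Task, addr, data usermem.Addr) error {
		word := t.Arch().Native(0)
		if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
			return err
		}
		_, err := word.CopyOut(t, data)
		return err
	}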
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, t.opts) +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 0b34c0099..a183b28c1 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits } -// release releases the thread group's resources. -func (tg *ThreadGroup) release(t *Task) { +// Release releases the thread group's resources. +func (tg *ThreadGroup) Release(ctx context.Context) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. tg.itimerRealTimer.Destroy() @@ -325,7 +326,7 @@ func (tg *ThreadGroup) release(t *Task) { it.DestroyTimer() } if tg.mounts != nil { - tg.mounts.DecRef(t) + tg.mounts.DecRef(ctx) } } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 872e1a82d..fdadb52c0 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -36,6 +36,8 @@ import ( const TasksLimit = (1 << 16) // ThreadID is a generic thread identifier. +// +// +marshal type ThreadID int32 // String returns a decimal representation of the ThreadID. @@ -263,6 +265,13 @@ func (ns *PIDNamespace) Tasks() []*Task { return tasks } +// NumTasks returns the number of tasks in ns. +func (ns *PIDNamespace) NumTasks() int { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return len(ns.tids) +} + // ThreadGroups returns a snapshot of the thread groups in ns. func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { return ns.ThreadGroupsAppend(nil) diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index e959700f2..f61a8e164 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) { // Timer's Clock) at which the Setting was changed. Setting s.Enabled to true // starts the timer, while setting s.Enabled to false stops it. // -// Preconditions: The Timer must not be paused. f cannot call any Timer methods -// since it is called with the Timer mutex locked. +// Preconditions: +// * The Timer must not be paused. +// * f cannot call any Timer methods since it is called with the Timer mutex +// locked. func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { now := t.clock.Now() t.mu.Lock() diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 290c32466..9bc452e67 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -17,7 +17,6 @@ package kernel import ( "fmt" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -28,6 +27,8 @@ import ( // // They are exposed to the VDSO via a parameter page managed by VDSOParamPage, // which also includes a sequence counter. +// +// +marshal type vdsoParams struct { monotonicReady uint64 monotonicBaseCycles int64 @@ -68,21 +69,29 @@ type VDSOParamPage struct { // checked in state_test_util tests, causing this field to change across // save / restore. 
 	seq uint64
+
+	// copyScratchBuffer is a temporary buffer used to marshal the params before
+	// copying them to the real parameter page. The parameter page is typically
+	// updated at a moderate frequency of ~O(seconds) throughout the lifetime of
+	// the sentry, so reusing this buffer is a good tradeoff between memory
+	// usage and the cost of allocation.
+	copyScratchBuffer []byte
 }

 // NewVDSOParamPage returns a VDSOParamPage.
 //
 // Preconditions:
-//
 // * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
 // not take ownership of fr; it must remain allocated for the lifetime of the
 // VDSOParamPage.
-//
 // * VDSOParamPage must be the only writer to fr.
-//
 // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
 func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
-	return &VDSOParamPage{mfp: mfp, fr: fr}
+	return &VDSOParamPage{
+		mfp:               mfp,
+		fr:                fr,
+		copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()),
+	}
 }

 // access returns a mapping of the param page.
@@ -136,7 +145,8 @@ func (v *VDSOParamPage) Write(f func() vdsoParams) error {
 	// Get the new params.
 	p := f()
-	buf := binary.Marshal(nil, usermem.ByteOrder, p)
+	buf := v.copyScratchBuffer[:p.SizeBytes()]
+	p.MarshalUnsafe(buf)

 	// Skip the sequence counter.
 	if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go
index 77e1fe217..0bade6e57 100644
--- a/pkg/sentry/limits/context.go
+++ b/pkg/sentry/limits/context.go
@@ -33,3 +33,12 @@ func FromContext(ctx context.Context) *LimitSet {
 	}
 	return nil
 }
+
+// FromContextOrDie returns FromContext(ctx) if the latter is not nil.
+// Otherwise it panics.
+func FromContextOrDie(ctx context.Context) *LimitSet {
+	if v := ctx.Value(CtxLimits); v != nil {
+		return v.(*LimitSet)
+	}
+	panic("failed to create limit set from context")
+}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 20dd1cc21..98af2cc38 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -194,6 +194,10 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
 		log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize)
 		return elfInfo{}, syserror.ENOEXEC
 	}
+	if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 {
+		ctx.Infof("Unsupported phdr offset %d", hdr.Phoff)
+		return elfInfo{}, syserror.ENOEXEC
+	}

 	phdrBuf := make([]byte, totalPhdrSize)
 	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
@@ -402,8 +406,7 @@ type loadedELF struct {
 //
 // It does not load the ELF interpreter, or return any auxv entries.
 //
-// Preconditions:
-// * f is an ELF file
func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { first := true var start, end usermem.Addr @@ -438,6 +441,10 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) return loadedELF{}, syserror.ENOEXEC } + if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 { + ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off) + return loadedELF{}, syserror.ENOEXEC + } path := make([]byte, phdr.Filesz) _, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off)) @@ -571,8 +578,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in // It does not load the ELF interpreter, or return any auxv entries. // // Preconditions: -// * f is an ELF file -// * f is the first ELF loaded into m +// * f is an ELF file. +// * f is the first ELF loaded into m. func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) { info, err := parseHeader(ctx, f) if err != nil { @@ -609,8 +616,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // // It does not return any auxv entries. // -// Preconditions: -// * f is an ELF file +// Preconditions: f is an ELF file. func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { @@ -640,8 +646,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.Fil // If loadELF returns ErrSwitchFile it should be called again with the returned // path and argv. // -// Preconditions: -// * args.File is an ELF file +// Preconditions: args.File is an ELF file. func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) { bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File) if err != nil { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 8d6802ea3..c69b62db9 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -122,7 +122,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch if err != nil { return nil, err } - return &arch.Stack{a, m, ar.End}, nil + return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil } const ( @@ -215,8 +215,8 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // path and argv. // // Preconditions: -// * The Task MemoryManager is empty. -// * Load is called on the Task goroutine. +// * The Task MemoryManager is empty. +// * Load is called on the Task goroutine. func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the executable itself. loaded, ac, file, newArgv, err := loadExecutable(ctx, args) @@ -247,20 +247,20 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V } // Push the original filename to the stack, for AT_EXECFN. - execfn, err := stack.Push(args.Filename) - if err != nil { + if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux()) } + execfn := stack.Bottom // Push 16 random bytes on the stack which AT_RANDOM will point to. 
var b [16]byte if _, err := rand.Read(b[:]); err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux()) } - random, err := stack.Push(b) - if err != nil { + if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux()) } + random := stack.Bottom c := auth.CredentialsFromContext(ctx) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 05a294fe6..241d87835 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -380,3 +380,9 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) return vdsoAddr, nil } + +// Release drops references on mappings held by v. +func (v *VDSO) Release(ctx context.Context) { + v.ParamPage.DecRef(ctx) + v.vdso.DecRef(ctx) +} diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index d609c1ae0..457ed87f8 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -177,7 +177,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr // AddMapping adds the given mapping and returns the set of MappableRanges that // previously had no mappings. // -// Preconditions: As for Mappable.AddMapping. +// Preconditions: Same as Mappable.AddMapping. func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var mapped []MappableRange @@ -204,7 +204,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui // RemoveMapping removes the given mapping and returns the set of // MappableRanges that now have no mappings. // -// Preconditions: As for Mappable.RemoveMapping. +// Preconditions: Same as Mappable.RemoveMapping. func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var unmapped []MappableRange diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 65d83096f..7fd77925f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -28,9 +28,9 @@ import ( // // See mm/mm.go for Mappable's place in the lock order. // -// Preconditions: For all Mappable methods, usermem.AddrRanges and -// MappableRanges must be non-empty (Length() != 0), and usermem.Addrs and -// Mappable offsets must be page-aligned. +// All Mappable methods have the following preconditions: +// * usermem.AddrRanges and MappableRanges must be non-empty (Length() != 0). +// * usermem.Addrs and Mappable offsets must be page-aligned. type Mappable interface { // AddMapping notifies the Mappable of a mapping from addresses ar in ms to // offsets [offset, offset+ar.Length()) in this Mappable. @@ -48,8 +48,10 @@ type Mappable interface { // addresses ar in ms to offsets [offset, offset+ar.Length()) in this // Mappable. // - // Preconditions: offset+ar.Length() does not overflow. The removed mapping - // must exist. writable must match the corresponding call to AddMapping. + // Preconditions: + // * offset+ar.Length() does not overflow. + // * The removed mapping must exist. writable must match the + // corresponding call to AddMapping. 
RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms @@ -60,9 +62,10 @@ type Mappable interface { // CopyMapping is only called when a mapping is copied within a given // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. // - // Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not - // overflow. The mapping at srcAR must exist. writable must match the - // corresponding call to AddMapping. + // Preconditions: + // * offset+srcAR.Length() and offset+dstAR.Length() do not overflow. + // * The mapping at srcAR must exist. writable must match the + // corresponding call to AddMapping. CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error // Translate returns the Mappable's current mappings for at least the range @@ -77,11 +80,14 @@ type Mappable interface { // reference is held on all pages in a File that may be the result // of a valid Translation. // - // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). - // required and optional must be page-aligned. The caller must have - // established a mapping for all of the queried offsets via a previous call - // to AddMapping. The caller is responsible for ensuring that calls to - // Translate synchronize with invalidation. + // Preconditions: + // * required.Length() > 0. + // * optional.IsSupersetOf(required). + // * required and optional must be page-aligned. + // * The caller must have established a mapping for all of the queried + // offsets via a previous call to AddMapping. + // * The caller is responsible for ensuring that calls to Translate + // synchronize with invalidation. // // Postconditions: See CheckTranslateResult. Translate(ctx context.Context, required, optional MappableRange, at usermem.AccessType) ([]Translation, error) @@ -118,10 +124,10 @@ func (t Translation) FileRange() FileRange { // CheckTranslateResult returns an error if (ts, terr) does not satisfy all // postconditions for Mappable.Translate(required, optional, at). // -// Preconditions: As for Mappable.Translate. +// Preconditions: Same as Mappable.Translate. func CheckTranslateResult(required, optional MappableRange, at usermem.AccessType, ts []Translation, terr error) error { // Verify that the inputs to Mappable.Translate were valid. - if !required.WellFormed() || required.Length() <= 0 { + if !required.WellFormed() || required.Length() == 0 { panic(fmt.Sprintf("invalid required range: %v", required)) } if !usermem.Addr(required.Start).IsPageAligned() || !usermem.Addr(required.End).IsPageAligned() { @@ -139,7 +145,7 @@ func CheckTranslateResult(required, optional MappableRange, at usermem.AccessTyp return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required) } for i, t := range ts { - if !t.Source.WellFormed() || t.Source.Length() <= 0 { + if !t.Source.WellFormed() || t.Source.Length() == 0 { return fmt.Errorf("Translation %+v has invalid Source", t) } if !usermem.Addr(t.Source.Start).IsPageAligned() || !usermem.Addr(t.Source.End).IsPageAligned() { @@ -214,7 +220,9 @@ type MappingSpace interface { // Invalidate must not take any locks preceding mm.MemoryManager.activeMu // in the lock order. // - // Preconditions: ar.Length() != 0. ar must be page-aligned. + // Preconditions: + // * ar.Length() != 0. + // * ar must be page-aligned. 
Invalidate(ar usermem.AddrRange, opts InvalidateOpts) } @@ -375,16 +383,20 @@ type File interface { // IncRef increments the reference count on all pages in fr. // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. (The File - // interface does not provide a way to acquire an initial reference; - // implementors may define mechanisms for doing so.) + // Preconditions: + // * fr.Start and fr.End must be page-aligned. + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. (The File + // interface does not provide a way to acquire an initial reference; + // implementors may define mechanisms for doing so.) IncRef(fr FileRange) // DecRef decrements the reference count on all pages in fr. // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. + // Preconditions: + // * fr.Start and fr.End must be page-aligned. + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. DecRef(fr FileRange) // MapInternal returns a mapping of the given file offsets in the invoking @@ -392,8 +404,9 @@ type File interface { // // Note that fr.Start and fr.End need not be page-aligned. // - // Preconditions: fr.Length() > 0. At least one reference must be held on - // all pages in fr. + // Preconditions: + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. // // Postconditions: The returned mapping is valid as long as at least one // reference is held on the mapped pages. diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index f9d0837a1..b4a47ccca 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -73,12 +73,35 @@ go_template_instance( }, ) +go_template_instance( + name = "aio_mappable_refs", + out = "aio_mappable_refs.go", + package = "mm", + prefix = "aioMappable", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "aioMappable", + }, +) + +go_template_instance( + name = "special_mappable_refs", + out = "special_mappable_refs.go", + package = "mm", + prefix = "SpecialMappable", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "SpecialMappable", + }, +) + go_library( name = "mm", srcs = [ "address_space.go", "aio_context.go", "aio_context_state.go", + "aio_mappable_refs.go", "debug.go", "file_refcount_set.go", "io.go", @@ -92,6 +115,7 @@ go_library( "save_restore.go", "shm.go", "special_mappable.go", + "special_mappable_refs.go", "syscalls.go", "vma.go", "vma_set.go", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 5c667117c..a93e76c75 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -166,8 +166,12 @@ func (mm *MemoryManager) Deactivate() { // mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings // for all addresses in ar should be precommitted. // -// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +// Preconditions: +// * mm.activeMu must be locked. +// * mm.as != nil. +// * ar.Length() != 0. +// * ar must be page-aligned. +// * pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. 
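The two go_template_instance rules above stamp out type-specialized reference counters that replace the embedded refs.AtomicRefCount in the aio_context.go and special_mappable.go hunks that follow. A minimal sketch of the shape such a generated type plausibly has; only the IncRef/DecRef(destroy)/EnableLeakCheck surface is taken from this diff, and the counter layout is an assumption:

package mm

import "sync/atomic"

// aioMappableRefs sketches a refs_template instantiation. The counter is
// assumed to be biased so the zero value already holds one reference,
// letting a plain composite literal (as in newAIOMappable) start out usable.
type aioMappableRefs struct {
    refCount int64 // 0 means one outstanding reference (assumed layout)
}

// IncRef acquires an additional reference.
func (r *aioMappableRefs) IncRef() { atomic.AddInt64(&r.refCount, 1) }

// DecRef releases a reference, invoking destroy on the last drop. Passing
// the destructor at release time is what lets the callers below replace
// AtomicRefCount.DecRefWithDestructor.
func (r *aioMappableRefs) DecRef(destroy func()) {
    if atomic.AddInt64(&r.refCount, -1) == -1 && destroy != nil {
        destroy()
    }
}

// EnableLeakCheck registers the object with the leak checker; note the
// generated version no longer needs the name string the old API took.
func (r *aioMappableRefs) EnableLeakCheck() {}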
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 16fea53c4..7bf48cb2c 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -17,7 +17,6 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -239,7 +238,7 @@ func (ctx *AIOContext) Drain() { // // +stateify savable type aioMappable struct { - refs.AtomicRefCount + aioMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange @@ -253,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { return nil, err } m := aioMappable{mfp: mfp, fr: fr} - m.EnableLeakCheck("mm.aioMappable") + m.EnableLeakCheck() return &m, nil } // DecRef implements refs.RefCounter.DecRef. func (m *aioMappable) DecRef(ctx context.Context) { - m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + m.aioMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index fa776f9c6..a8ac48080 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -441,7 +441,10 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts // handleASIOFault handles a page fault at address addr for an AddressSpaceIO // operation spanning ioar. // -// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). +// Preconditions: +// * mm.as != nil. +// * ioar.Length() != 0. +// * ioar.Contains(addr). func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error { // Try to map all remaining pages in the I/O operation. This RoundUp can't // overflow because otherwise it would have been caught by CheckIORange. @@ -629,7 +632,9 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme // at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to // truncate usermem.AddrRangeSeq when errors occur. // -// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End. +// Preconditions: +// * !arsit.IsEmpty(). +// * end <= arsit.Head().End. func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq { ar := arsit.Head() if end <= ar.Start { diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 3e85964e4..92cc87d84 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -235,6 +235,20 @@ type MemoryManager struct { // vdsoSigReturnAddr is the address of 'vdso_sigreturn'. vdsoSigReturnAddr uint64 + + // membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has + // previously been called. Since, as of this writing, + // MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory + // barrier, membarrierPrivateEnabled has no other effect. + // + // membarrierPrivateEnabled is accessed using atomic memory operations. + membarrierPrivateEnabled uint32 + + // membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously + // been called. + // + // membarrierRSeqEnabled is accessed using atomic memory operations. + membarrierRSeqEnabled uint32 } // vma represents a virtual memory area. @@ -242,7 +256,7 @@ type MemoryManager struct { // +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is - // nil, the vma represents a private anonymous mapping. 
+ // nil, the vma represents an anonymous mapping. mappable memmap.Mappable // off is the offset into mappable at which this vma begins. If mappable is diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index fdc308542..acac3d357 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) { defer mm.DecUsers(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ - Length: 2 * usermem.PageSize, + Length: 2 * usermem.PageSize, + Private: true, }) if err != nil { t.Fatalf("MMap got err %v want nil", err) diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 930ec895f..7e5f7de64 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -31,10 +31,12 @@ import ( // iterator to the pma containing ar.Start. Otherwise it returns a terminal // iterator. // -// Preconditions: mm.activeMu must be locked. ar.Length() != 0. +// Preconditions: +// * mm.activeMu must be locked. +// * ar.Length() != 0. func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -89,13 +91,16 @@ func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at user // // - An error that is non-nil if pmas exist for only a subset of ar. // -// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist -// for all addresses in ar, and support accesses of type at (i.e. permission -// checks must have been performed against vmas). +// Preconditions: +// * mm.mappingMu must be locked. +// * mm.activeMu must be locked for writing. +// * ar.Length() != 0. +// * vseg.Range().Contains(ar.Start). +// * vmas must exist for all addresses in ar, and support accesses of type at +// (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { @@ -135,9 +140,11 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar // exist. If this is not equal to ars, it returns a non-nil error explaining // why. // -// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. vmas must exist for all addresses in ars, and support accesses of -// type at (i.e. permission checks must have been performed against vmas). +// Preconditions: +// * mm.mappingMu must be locked. +// * mm.activeMu must be locked for writing. +// * vmas must exist for all addresses in ars, and support accesses of type at +// (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() @@ -186,7 +193,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR // getVecPMAsLocked; other clients should call one of those instead. 
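These hunks (and the vma.go ones further down) consistently replace Length() <= 0 with Length() == 0 in the checkInvariants blocks: Length() on these range types is an unsigned difference, so it can never be negative and the equality test states the intent exactly. In isolation:

package main

import "fmt"

// addrRange stands in for usermem.AddrRange: a half-open [start, end) range
// whose length is an unsigned difference, so a negative length is
// unrepresentable and == 0 is the precise emptiness test.
type addrRange struct{ start, end uint64 }

func (ar addrRange) length() uint64 { return ar.end - ar.start }

func main() {
    empty := addrRange{0x1000, 0x1000}
    fmt.Println(empty.length() == 0) // true: nothing to map or translate
}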
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { @@ -216,7 +223,7 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter // Need a pma here. optAR := vseg.Range().Intersect(pgap.Range()) if checkInvariants { - if optAR.Length() <= 0 { + if optAR.Length() == 0 { panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) } } @@ -518,8 +525,10 @@ func privateAligned(ar usermem.AddrRange) usermem.AddrRange { // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory // and update the pma to indicate that it does not require copy-on-write. // -// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be -// locked. mm.activeMu must be locked for writing. +// Preconditions: +// * vseg.Range().IsSupersetOf(pseg.Range()). +// * mm.mappingMu must be locked. +// * mm.activeMu must be locked for writing. func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool { pma := pseg.ValuePtr() if !pma.needCOW { @@ -551,7 +560,7 @@ func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterat // Invalidate implements memmap.MappingSpace.Invalidate. func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -568,11 +577,13 @@ func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.Invalidate // invalidateLocked removes pmas and AddressSpace mappings of those pmas for // addresses in ar. // -// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar -// must be page-aligned. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -613,10 +624,12 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat // most I/O. It should only be used in contexts that would use get_user_pages() // in the Linux kernel. // -// Preconditions: ar.Length() != 0. ar must be page-aligned. +// Preconditions: +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -693,15 +706,19 @@ func Unpin(prs []PinnedRange) { // movePMAsLocked moves all pmas in oldAR to newAR. // -// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. -// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR). -// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned. 
+// Preconditions: +// * mm.activeMu must be locked for writing. +// * oldAR.Length() != 0. +// * oldAR.Length() <= newAR.Length(). +// * !oldAR.Overlaps(newAR). +// * mm.pmas.IsEmptyRange(newAR). +// * oldAR and newAR must be page-aligned. func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { if checkInvariants { - if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() { + if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() { panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) } - if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { + if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() { panic(fmt.Sprintf("invalid newAR: %v", newAR)) } if oldAR.Length() > newAR.Length() { @@ -751,15 +768,17 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { // - An error that is non-nil if internal mappings exist for only a subset of // ar. // -// Preconditions: mm.activeMu must be locked for writing. -// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar. -// ar.Length() != 0. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * pseg.Range().Contains(ar.Start). +// * pmas must exist for all addresses in ar. +// * ar.Length() != 0. // // Postconditions: getPMAInternalMappingsLocked does not invalidate iterators // into mm.pmas. func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { @@ -783,8 +802,9 @@ func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar userm // internal mappings exist. If this is not equal to ars, it returns a non-nil // error explaining why. // -// Preconditions: mm.activeMu must be locked for writing. pmas must exist for -// all addresses in ar. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * pmas must exist for all addresses in ar. // // Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators // into mm.pmas. @@ -803,12 +823,15 @@ func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSe // internalMappingsLocked returns internal mappings for addresses in ar. // -// Preconditions: mm.activeMu must be locked. Internal mappings must have been -// previously established for all addresses in ar. ar.Length() != 0. -// pseg.Range().Contains(ar.Start). +// Preconditions: +// * mm.activeMu must be locked. +// * Internal mappings must have been previously established for all addresses +// in ar. +// * ar.Length() != 0. +// * pseg.Range().Contains(ar.Start). func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { @@ -839,8 +862,10 @@ func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.Add // vecInternalMappingsLocked returns internal mappings for addresses in ars. // -// Preconditions: mm.activeMu must be locked. Internal mappings must have been -// previously established for all addresses in ars. +// Preconditions: +// * mm.activeMu must be locked. +// * Internal mappings must have been previously established for all addresses +// in ars. 
func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq { var ims []safemem.Block for ; !ars.IsEmpty(); ars = ars.Tail() { @@ -969,7 +994,9 @@ func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (p // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do // so by scanning linearly backward from pgap. // -// Preconditions: mm.activeMu must be locked. addr <= pgap.Start(). +// Preconditions: +// * mm.activeMu must be locked. +// * addr <= pgap.Start(). func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator { if checkInvariants { if !pgap.Ok() { @@ -1015,13 +1042,15 @@ func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } -// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. +// Preconditions: +// * pseg.Range().IsSupersetOf(ar). +// * ar.Length != 0. func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") } - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().IsSupersetOf(ar) { diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 4cdb52eb6..2dbe5b751 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -16,7 +16,6 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -31,7 +30,7 @@ import ( // // +stateify savable type SpecialMappable struct { - refs.AtomicRefCount + SpecialMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange @@ -45,13 +44,13 @@ type SpecialMappable struct { // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} - m.EnableLeakCheck("mm.SpecialMappable") + m.EnableLeakCheck() return &m } // DecRef implements refs.RefCounter.DecRef. func (m *SpecialMappable) DecRef(ctx context.Context) { - m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + m.SpecialMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -137,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 { // NewSharedAnonMappable returns a SpecialMappable that implements the // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. // -// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux -// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should -// do the same to get non-zero device and inode IDs. +// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by +// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses +// a SpecialMappable instead, incorrectly getting device and inode IDs of zero +// and causing memory for shared anonymous mappings to be allocated up-front +// instead of on first touch; this is to avoid exacerbating the fs.MountSource +// leak (b/143656263). Delete this function along with VFS1. 
func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { return nil, syserror.EINVAL diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index e74d4e1c1..675efdc7c 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -17,6 +17,7 @@ package mm import ( "fmt" mrand "math/rand" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -93,18 +93,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme } } else { opts.Offset = 0 - if !opts.Private { - if opts.MappingIdentity != nil { - return 0, syserror.EINVAL - } - m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) - if err != nil { - return 0, err - } - defer m.DecRef(ctx) - opts.MappingIdentity = m - opts.Mappable = m - } } if opts.Addr.RoundDown() != opts.Addr { @@ -166,7 +154,9 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // populateVMA obtains pmas for addresses in ar in the given vma, and maps them // into mm.as if it is active. // -// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +// Preconditions: +// * mm.mappingMu must be locked. +// * vseg.Range().IsSupersetOf(ar). func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See @@ -208,8 +198,9 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u // preferable to populateVMA since it unlocks mm.mappingMu before performing // expensive operations that don't require it to be locked. // -// Preconditions: mm.mappingMu must be locked for writing. -// vseg.Range().IsSupersetOf(ar). +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { @@ -1284,3 +1275,27 @@ func (mm *MemoryManager) VirtualDataSize() uint64 { defer mm.mappingMu.RUnlock() return mm.dataAS } + +// EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to +// return true. +func (mm *MemoryManager) EnableMembarrierPrivate() { + atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1) +} + +// IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has +// previously been called. +func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool { + return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0 +} + +// EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to +// return true. +func (mm *MemoryManager) EnableMembarrierRSeq() { + atomic.StoreUint32(&mm.membarrierRSeqEnabled, 1) +} + +// IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has +// previously been called. 
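The Enable*/Is* pairs here (the RSeq getter concludes just below) implement a set-once boolean with atomic stores and loads, callable without taking any of mm's mutexes. The same pattern in isolation:

package main

import (
    "fmt"
    "sync/atomic"
)

// onceFlag mirrors the membarrier*Enabled fields: a uint32 flag that is
// only ever set, and may be read concurrently without locks.
type onceFlag struct{ v uint32 }

func (f *onceFlag) Enable()       { atomic.StoreUint32(&f.v, 1) }
func (f *onceFlag) Enabled() bool { return atomic.LoadUint32(&f.v) != 0 }

func main() {
    var f onceFlag
    fmt.Println(f.Enabled()) // false
    f.Enable()
    fmt.Println(f.Enabled()) // true, and stays true
}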
+func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool { + return atomic.LoadUint32(&mm.membarrierRSeqEnabled) != 0 +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index c4e1989ed..b8df72813 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -27,8 +27,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// Preconditions: mm.mappingMu must be locked for writing. opts must be valid -// as defined by the checks in MMap. +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * opts must be valid as defined by the checks in MMap. func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { if opts.MaxPerms != opts.MaxPerms.Effective() { panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) @@ -260,11 +261,12 @@ func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { // // - An error that is non-nil if vmas exist for only a subset of ar. // -// Preconditions: mm.mappingMu must be locked for reading; it may be -// temporarily unlocked. ar.Length() != 0. +// Preconditions: +// * mm.mappingMu must be locked for reading; it may be temporarily unlocked. +// * ar.Length() != 0. func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -342,11 +344,13 @@ const guardBytes = 256 * usermem.PageSize // unmapLocked unmaps all addresses in ar and returns the resulting gap in // mm.vmas. // -// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. -// ar must be page-aligned. +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -361,11 +365,13 @@ func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) // gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients // must do so before calling removeVMAsLocked. // -// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar -// must be page-aligned. +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -467,7 +473,9 @@ func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (v return v, v2 } -// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr). +// Preconditions: +// * vseg.ValuePtr().mappable != nil. +// * vseg.Range().Contains(addr). 
func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 { if checkInvariants { if !vseg.Ok() { @@ -491,8 +499,10 @@ func (vseg vmaIterator) mappableRange() memmap.MappableRange { return vseg.mappableRangeOf(vseg.Range()) } -// Preconditions: vseg.ValuePtr().mappable != nil. -// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. +// Preconditions: +// * vseg.ValuePtr().mappable != nil. +// * vseg.Range().IsSupersetOf(ar). +// * ar.Length() != 0. func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { if checkInvariants { if !vseg.Ok() { @@ -501,7 +511,7 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Range().IsSupersetOf(ar) { @@ -514,8 +524,10 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} } -// Preconditions: vseg.ValuePtr().mappable != nil. -// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. +// Preconditions: +// * vseg.ValuePtr().mappable != nil. +// * vseg.mappableRange().IsSupersetOf(mr). +// * mr.Length() != 0. func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { if checkInvariants { if !vseg.Ok() { @@ -524,7 +536,7 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } - if !mr.WellFormed() || mr.Length() <= 0 { + if !mr.WellFormed() || mr.Length() == 0 { panic(fmt.Sprintf("invalid mr: %v", mr)) } if !vseg.mappableRange().IsSupersetOf(mr) { @@ -540,7 +552,9 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by // scanning linearly forward from vseg. // -// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). +// Preconditions: +// * mm.mappingMu must be locked. +// * addr >= vseg.Start(). func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { if checkInvariants { if !vseg.Ok() { diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 7a3311a70..5b09b9feb 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -83,6 +83,7 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/log", "//pkg/memutil", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 46d3be58c..7c297fb9e 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -29,6 +29,7 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" @@ -224,6 +225,18 @@ type usageInfo struct { refs uint64 } +// canCommit returns true if the tracked region can be committed. +func (u *usageInfo) canCommit() bool { + // refs must be greater than 0 because we assume that reclaimable pages + // (that aren't already known to be committed) are not committed. This + // isn't necessarily true, even after the reclaimer does Decommit(), + // because the kernel may subsequently back the hugepage-sized region + // containing the decommitted page with a hugepage. 
However, it's + // consistent with our treatment of unallocated pages, which have the same + // property. + return !u.knownCommitted && u.refs != 0 +} + // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that // may be asked to deallocate that memory in the presence of memory pressure. type EvictableMemoryUser interface { @@ -507,7 +520,9 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 // nearest page. If this is shorter than length bytes due to an error returned // by r.ReadToBlocks(), it returns that error. // -// Preconditions: length > 0. length must be page-aligned. +// Preconditions: +// * length > 0. +// * length must be page-aligned. func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) { fr, err := f.Allocate(length, kind) if err != nil { @@ -826,6 +841,11 @@ func (f *MemoryFile) UpdateUsage() error { log.Debugf("UpdateUsage: skipped with usageSwapped!=0.") return nil } + // Linux updates usage values at CONFIG_HZ; more frequent scans are redundant. + if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC { + log.Debugf("UpdateUsage: skipped; previous scan was %d ms ago", scanningAfter) + return nil + } f.usageLast = time.Now() err = f.updateUsageLocked(currentUsage, mincore) @@ -839,7 +859,7 @@ func (f *MemoryFile) UpdateUsage() error { // pages by invoking checkCommitted, which is a function that, for each page i // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise. // -// Precondition: f.mu must be held. +// Precondition: f.mu must be held; it may be unlocked and reacquired. func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error { // Track if anything changed to elide the merge. In the common case, we // expect all segments to be committed and no merge to occur. @@ -866,7 +886,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( } else if f.usageSwapped != 0 { // We have more usage accounted for than the file itself. // That's fine, we probably caught a race where pages were - // being committed while the above loop was running. Just + // being committed while the below loop was running. Just // report the higher number that we found and ignore swap. usage.MemoryAccounting.Dec(f.usageSwapped, usage.System) f.usageSwapped = 0 @@ -878,21 +898,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // Iterate over all usage data. There will only be usage segments // present when there is an associated reference.
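The UpdateUsage guard above throttles scans to Linux's usage-update cadence: with CLOCKS_PER_SEC = 100, the minimum interval between scans works out to 10ms. The check in isolation (names here are illustrative, not from the patch):

package main

import (
    "fmt"
    "time"
)

const clocksPerSec = 100 // linux.CLOCKS_PER_SEC

var usageLast time.Time

// shouldScan mirrors the added guard: skip if the previous scan was more
// recent than one Linux clock tick (1000ms / 100 = 10ms).
func shouldScan(now time.Time) bool {
    if now.Sub(usageLast).Milliseconds() < time.Second.Milliseconds()/clocksPerSec {
        return false
    }
    usageLast = now
    return true
}

func main() {
    fmt.Println(shouldScan(time.Now())) // true
    fmt.Println(shouldScan(time.Now())) // false: within the 10ms window
}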
- if val.refs == 0 { + for seg := f.usage.FirstSegment(); seg.Ok(); { + if !seg.ValuePtr().canCommit() { + seg = seg.NextSegment() continue } @@ -915,56 +923,53 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( } // Query for new pages in core. - if err := checkCommitted(s, buf); err != nil { + // NOTE(b/165896008): mincore (which f.UpdateUsage() passes as + // checkCommitted) might take a really long time. So unlock f.mu + // while checkCommitted runs. + f.mu.Unlock() + err := checkCommitted(s, buf) + f.mu.Lock() + if err != nil { checkErr = err return } // Scan each page and switch out segments. - populatedRun := false - populatedRunStart := 0 - for i := 0; i <= bufLen; i++ { - // We run past the end of the slice here to - // simplify the logic and only set populated if - // we're still looking at elements. - populated := false - if i < bufLen { - populated = buf[i]&0x1 != 0 - } - - switch { - case populated == populatedRun: - // Keep the run going. - continue - case populated && !populatedRun: - // Begin the run. - populatedRun = true - populatedRunStart = i - // Keep going. + seg := f.usage.LowerBoundSegment(r.Start) + for i := 0; i < bufLen; { + if buf[i]&0x1 == 0 { + i++ continue - case !populated && populatedRun: - // Finish the run by changing this segment. - runRange := memmap.FileRange{ - Start: r.Start + uint64(populatedRunStart*usermem.PageSize), - End: r.Start + uint64(i*usermem.PageSize), + } + // Scan to the end of this committed range. + j := i + 1 + for ; j < bufLen; j++ { + if buf[j]&0x1 == 0 { + break } - seg = f.usage.Isolate(seg, runRange) - seg.ValuePtr().knownCommitted = true - // Advance the segment only if we still - // have work to do in the context of - // the original segment from the for - // loop. Otherwise, the for loop itself - // will advance the segment - // appropriately. - if runRange.End != r.End { - seg = seg.NextSegment() + } + committedFR := memmap.FileRange{ + Start: r.Start + uint64(i*usermem.PageSize), + End: r.Start + uint64(j*usermem.PageSize), + } + // Advance seg to committedFR.Start. + for seg.Ok() && seg.End() < committedFR.Start { + seg = seg.NextSegment() + } + // Mark pages overlapping committedFR as committed. + for seg.Ok() && seg.Start() < committedFR.End { + if seg.ValuePtr().canCommit() { + seg = f.usage.Isolate(seg, committedFR) + seg.ValuePtr().knownCommitted = true + amount := seg.Range().Length() + usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind) + f.usageExpected += amount + changedAny = true } - amount := runRange.Length() - usage.MemoryAccounting.Inc(amount, val.kind) - f.usageExpected += amount - changedAny = true - populatedRun = false + seg = seg.NextSegment() } + // Continue scanning for committed pages. + i = j + 1 } // Advance r.Start. @@ -976,6 +981,9 @@ if err != nil { return err } + + // Continue with the first segment after r.End. + seg = f.usage.LowerBoundSegment(r.End) } return nil @@ -1167,8 +1175,10 @@ func (f *MemoryFile) startEvictionsLocked() bool { return startedAny } -// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be -// locked. +// Preconditions: +// * info == f.evictable[user]. +// * !info.evicting. +// * f.mu must be locked.
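The rewritten loop above drops the run-state machine in favor of a direct scan that coalesces consecutive committed pages (bit 0 of each mincore output byte) into ranges before touching the segment set. The run detection on its own:

package main

import "fmt"

type fileRange struct{ start, end uint64 }

// committedRuns coalesces set bits into page-aligned ranges, mirroring the
// i/j scan added to updateUsageLocked (buf[j] is known uncommitted when the
// inner loop breaks, so resuming at j+1 skips it safely).
func committedRuns(buf []byte, base, pageSize uint64) []fileRange {
    var runs []fileRange
    for i := 0; i < len(buf); {
        if buf[i]&0x1 == 0 {
            i++
            continue
        }
        j := i + 1
        for ; j < len(buf); j++ {
            if buf[j]&0x1 == 0 {
                break
            }
        }
        runs = append(runs, fileRange{base + uint64(i)*pageSize, base + uint64(j)*pageSize})
        i = j + 1
    }
    return runs
}

func main() {
    fmt.Println(committedRuns([]byte{1, 1, 0, 1}, 0, 4096)) // [{0 8192} {12288 16384}]
}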
func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { info.evicting = true f.evictionWG.Add(1) diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 209b28053..db7d55ef2 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/context", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/hostmm", "//pkg/sentry/memmap", "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index 57be41647..9dfac3eae 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -54,8 +54,9 @@ type Forwarder struct { // } // defer f.Disable() // -// Preconditions: r must not be nil. f must not already be forwarding -// interrupts to a Receiver. +// Preconditions: +// * r must not be nil. +// * f must not already be forwarding interrupts to a Receiver. func (f *Forwarder) Enable(r Receiver) bool { if r == nil { panic("nil Receiver") } diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 3970dd81d..8ce411102 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -9,12 +9,12 @@ go_library( "bluepill.go", "bluepill_allocator.go", "bluepill_amd64.go", - "bluepill_amd64.s", "bluepill_amd64_unsafe.go", "bluepill_arm64.go", "bluepill_arm64.s", "bluepill_arm64_unsafe.go", "bluepill_fault.go", + "bluepill_impl_amd64.s", "bluepill_unsafe.go", "context.go", "filters_amd64.go", @@ -56,6 +56,7 @@ go_library( "//pkg/sentry/time", "//pkg/sync", "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", ], ) @@ -63,6 +64,7 @@ go_test( name = "kvm_test", srcs = [ "kvm_amd64_test.go", + "kvm_arm64_test.go", "kvm_test.go", "virtual_map_test.go", ], @@ -78,6 +80,15 @@ go_test( "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/time", "//pkg/usermem", ], ) + +genrule( + name = "bluepill_impl_amd64", + srcs = ["bluepill_amd64.s"], + outs = ["bluepill_impl_amd64.s"], + cmd = "(echo -e '// +build amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/sentry/platform/ring0/gen_offsets"], +) diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 2bc34a435..025ea93b5 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -19,11 +19,6 @@ // This is guaranteed to be zero. #define VCPU_CPU 0x0 -// CPU_SELF is the self reference in ring0's percpu. -// -// This is guaranteed to be zero. -#define CPU_SELF 0x0 - // Context offsets.
// // Only limited use of the context is done in the assembly stub below, most is @@ -44,7 +39,7 @@ begin: LEAQ VCPU_CPU(AX), BX BYTE CLI; check_vcpu: - MOVQ CPU_SELF(GS), CX + MOVQ ENTRY_CPU_SELF(GS), CX CMPQ BX, CX JE right_vCPU wrong_vcpu: diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 03a98512e..0a54dd30d 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -83,5 +83,34 @@ func bluepillStopGuest(c *vCPU) { // //go:nosplit func bluepillReadyStopGuest(c *vCPU) bool { - return c.runData.readyForInterruptInjection != 0 + if c.runData.readyForInterruptInjection == 0 { + return false + } + + if c.runData.ifFlag == 0 { + // This is impossible if readyForInterruptInjection is 1. + throw("interrupts are disabled") + } + + // Disable interrupts if we are in kernel space. + // + // When the Sentry switches into kernel mode, it disables + // interrupts. But when the Go runtime switches to a goroutine that + // was saved in host mode, it restores flags, which re-enables + // interrupts. See the comment on UserFlagsSet for more details. + uregs := userRegs{} + err := c.getUserRegisters(&uregs) + if err != 0 { + throw("failed to get user registers") + } + + if ring0.IsKernelFlags(uregs.RFLAGS) { + uregs.RFLAGS &^= ring0.KernelFlagsClear + err = c.setUserRegisters(&uregs) + if err != 0 { + throw("failed to set user registers") + } + return false + } + return true } diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index ed5ae03d3..58f3d6fdd 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -39,6 +39,16 @@ var ( } ) +// getTLS returns the value of TPIDR_EL0 register. +// +//go:nosplit +func getTLS() (value uint64) + +// setTLS writes the TPIDR_EL0 value. +// +//go:nosplit +func setTLS(value uint64) + // bluepillArchEnter is called during bluepillEnter. // //go:nosplit @@ -51,6 +61,8 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { regs.Pstate = context.Pstate regs.Pstate &^= uint64(ring0.PsrFlagsClear) regs.Pstate |= ring0.KernelFlagsSet + regs.TPIDR_EL0 = getTLS() + return } @@ -65,6 +77,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { context.Pstate = regs.Pstate context.Pstate &^= uint64(ring0.PsrFlagsClear) context.Pstate |= ring0.UserFlagsSet + setTLS(regs.TPIDR_EL0) lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s index 04efa0147..09c7e88e5 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.s +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -32,6 +32,18 @@ #define CONTEXT_PC 0x1B8 #define CONTEXT_R0 0xB8 +// getTLS returns the value of TPIDR_EL0 register. +TEXT ·getTLS(SB),NOSPLIT,$0-8 + MRS TPIDR_EL0, R1 + MOVD R1, ret+0(FP) + RET + +// setTLS writes the TPIDR_EL0 value. +TEXT ·setTLS(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + MSR R1, TPIDR_EL0 + RET + // See bluepill.go.
TEXT ·bluepill(SB),NOSPLIT,$0 begin: diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index e34f46aeb..a182e4f22 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -98,6 +98,10 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi } errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags) if errno == 0 { + // Store the physical address in the slot. This is used to + // avoid calls to handleBluepillFault in the future (see + // machine.mapPhysical). + atomic.StoreUintptr(&m.usedSlots[slot], physical) // Successfully added region; we can increment nextSlot and // allow another set to proceed here. atomic.StoreUint32(&m.nextSlot, slot+1) diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index bf357de1a..eb05950cd 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -62,6 +62,9 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { // //go:nosplit func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { + // Increment our counter. + atomic.AddUint64(&c.guestExits, 1) + // Copy out registers. bluepillArchExit(c, bluepillArchContext(context)) @@ -89,9 +92,6 @@ func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) - // Increment the number of switches. - atomic.AddUint32(&c.switches, 1) - // Mark this as guest mode. switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) { case vCPUUser: // Expected case. diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 6e6b76416..17268d127 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -15,6 +15,8 @@ package kvm import ( + "sync/atomic" + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" @@ -75,6 +77,9 @@ func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac a // Clear the address space. cpu.active.set(nil) + // Increment the number of user exits. + atomic.AddUint64(&cpu.userExits, 1) + // Release resources. c.machine.Put(cpu) diff --git a/pkg/sentry/platform/kvm/filters_amd64.go b/pkg/sentry/platform/kvm/filters_amd64.go index 7d949f1dd..d3d216aa5 100644 --- a/pkg/sentry/platform/kvm/filters_amd64.go +++ b/pkg/sentry/platform/kvm/filters_amd64.go @@ -17,14 +17,23 @@ package kvm import ( "syscall" + "golang.org/x/sys/unix" + + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // SyscallFilters returns syscalls made exclusively by the KVM platform. 
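The filter entries in these hunks admit membarrier(2) only with the exact argument pair the sentry issues, matching the MEMBARRIER_CMD_PRIVATE_EXPEDITED barrier the KVM platform now relies on. A sketch of that call; the command value is from the Linux uapi, and a process that has not first registered with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED gets EPERM:

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

const membarrierCmdPrivateExpedited = 1 << 3 // MEMBARRIER_CMD_PRIVATE_EXPEDITED

func main() {
    // The seccomp rule permits exactly membarrier(PRIVATE_EXPEDITED, 0);
    // any other command or a non-zero flags argument traps under the filter.
    _, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, membarrierCmdPrivateExpedited, 0, 0)
    fmt.Println(errno) // EPERM unless the process registered first
}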
func (*KVM) SyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_IOCTL: {}, + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_IOCTL: {}, + unix.SYS_MEMBARRIER: []seccomp.Rule{ + { + seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED), + seccomp.EqualTo(0), + }, + }, syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, syscall.SYS_RT_SIGTIMEDWAIT: {}, diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go index 9245d07c2..21abc2a3d 100644 --- a/pkg/sentry/platform/kvm/filters_arm64.go +++ b/pkg/sentry/platform/kvm/filters_arm64.go @@ -17,13 +17,22 @@ package kvm import ( "syscall" + "golang.org/x/sys/unix" + + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // SyscallFilters returns syscalls made exclusively by the KVM platform. func (*KVM) SyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - syscall.SYS_IOCTL: {}, + syscall.SYS_IOCTL: {}, + unix.SYS_MEMBARRIER: []seccomp.Rule{ + { + seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED), + seccomp.EqualTo(0), + }, + }, syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, syscall.SYS_RT_SIGTIMEDWAIT: {}, diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index ae813e24e..dd45ad10b 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -63,6 +63,9 @@ type runData struct { type KVM struct { platform.NoCPUPreemptionDetection + // KVM never changes mm_structs. + platform.UseHostProcessMemoryBarrier + // machine is the backing VM. machine *machine } @@ -156,15 +159,7 @@ func (*KVM) MaxUserAddress() usermem.Addr { func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { // Allocate page tables and install system mappings. pageTables := pagetables.New(newAllocator()) - applyPhysicalRegions(func(pr physicalRegion) bool { - // Map the kernel in the upper half. - pageTables.Map( - usermem.Addr(ring0.KernelStartAddress|pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: usermem.AnyAccess}, - pr.physical) - return true // Keep iterating. - }) + k.machine.mapUpperHalf(pageTables) // Return the new address space. return &addressSpace{ diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go new file mode 100644 index 000000000..0e3d84d95 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go @@ -0,0 +1,31 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build arm64 + +package kvm + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" +) + +func TestKernelTLS(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + if !testutil.TLSWorks() { + t.Errorf("tls does not work, and it should!") + } + }) +} diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 3bf918446..6abaa21c4 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -26,12 +26,16 @@ const ( _KVM_RUN = 0xae80 _KVM_NMI = 0xae9a _KVM_CHECK_EXTENSION = 0xae03 + _KVM_GET_TSC_KHZ = 0xaea3 + _KVM_SET_TSC_KHZ = 0xaea2 _KVM_INTERRUPT = 0x4004ae86 _KVM_SET_MSRS = 0x4008ae89 _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 _KVM_SET_REGS = 0x4090ae82 _KVM_SET_SREGS = 0x4138ae84 + _KVM_GET_MSRS = 0xc008ae88 _KVM_GET_REGS = 0x8090ae81 + _KVM_GET_SREGS = 0x8138ae83 _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 _KVM_SET_CPUID2 = 0x4008ae90 _KVM_SET_SIGNAL_MASK = 0x4004ae8b @@ -56,6 +60,7 @@ const ( // KVM capability options. const ( + _KVM_CAP_MAX_MEMSLOTS = 0x0a _KVM_CAP_MAX_VCPUS = 0x42 _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5 _KVM_CAP_VCPU_EVENTS = 0x29 @@ -64,6 +69,7 @@ const ( // KVM limits. const ( + _KVM_NR_MEMSLOTS = 0x100 _KVM_NR_VCPUS = 0xff _KVM_NR_INTERRUPTS = 0x100 _KVM_NR_CPUID_ENTRIES = 0x100 @@ -77,11 +83,14 @@ const ( ) // KVM hypercall list. +// // Canonical list of hypercalls supported. const ( // On amd64, it uses 'HLT' to leave the guest. + // // Unlike amd64, arm64 can only uses mmio_exit/psci to leave the guest. - // _KVM_HYPERCALL_VMEXIT is only used on Arm64 for now. + // + // _KVM_HYPERCALL_VMEXIT is only used on arm64 for now. _KVM_HYPERCALL_VMEXIT int = iota _KVM_HYPERCALL_MAX ) diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go index 9a7be3655..84df0f878 100644 --- a/pkg/sentry/platform/kvm/kvm_const_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -101,13 +101,20 @@ const ( // Arm64: Memory Attribute Indirection Register EL1. const ( - _MT_DEVICE_nGnRnE = 0 - _MT_DEVICE_nGnRE = 1 - _MT_DEVICE_GRE = 2 - _MT_NORMAL_NC = 3 - _MT_NORMAL = 4 - _MT_NORMAL_WT = 5 - _MT_EL1_INIT = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8) + _MT_DEVICE_nGnRnE = 0 + _MT_DEVICE_nGnRE = 1 + _MT_DEVICE_GRE = 2 + _MT_NORMAL_NC = 3 + _MT_NORMAL = 4 + _MT_NORMAL_WT = 5 + _MT_ATTR_DEVICE_nGnRnE = 0x00 + _MT_ATTR_DEVICE_nGnRE = 0x04 + _MT_ATTR_DEVICE_GRE = 0x0c + _MT_ATTR_NORMAL_NC = 0x44 + _MT_ATTR_NORMAL_WT = 0xbb + _MT_ATTR_NORMAL = 0xff + _MT_ATTR_MASK = 0xff + _MT_EL1_INIT = (_MT_ATTR_DEVICE_nGnRnE << (_MT_DEVICE_nGnRnE * 8)) | (_MT_ATTR_DEVICE_nGnRE << (_MT_DEVICE_nGnRE * 8)) | (_MT_ATTR_DEVICE_GRE << (_MT_DEVICE_GRE * 8)) | (_MT_ATTR_NORMAL_NC << (_MT_NORMAL_NC * 8)) | (_MT_ATTR_NORMAL << (_MT_NORMAL * 8)) | (_MT_ATTR_NORMAL_WT << (_MT_NORMAL_WT * 8)) ) const ( diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 45b3180f1..e58acc071 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) @@ -411,9 +412,9 @@ func TestWrongVCPU(t *testing.T) { // Basic test, one then the other. 
bluepill(c1) bluepill(c2) - if c2.switches == 0 { + if c2.guestExits == 0 { // Don't allow the test to proceed if this fails. - t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + t.Fatalf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) } // Alternate vCPUs; we expect to need to trigger the @@ -422,11 +423,11 @@ func TestWrongVCPU(t *testing.T) { bluepill(c1) bluepill(c2) } - if count := c1.switches; count < 90 { - t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + if count := c1.guestExits; count < 90 { + t.Errorf("wrong vCPU#1 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) } - if count := c2.switches; count < 90 { - t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + if count := c2.guestExits; count < 90 { + t.Errorf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) } return false }) @@ -442,6 +443,22 @@ func TestWrongVCPU(t *testing.T) { }) } +func TestRdtsc(t *testing.T) { + var i int // Iteration count. + kvmTest(t, nil, func(c *vCPU) bool { + start := ktime.Rdtsc() + bluepill(c) + guest := ktime.Rdtsc() + redpill() + end := ktime.Rdtsc() + if start > guest || guest > end { + t.Errorf("inconsistent time: start=%d, guest=%d, end=%d", start, guest, end) + } + i++ + return i < 100 + }) +} + func BenchmarkApplicationSyscall(b *testing.B) { var ( i int // Iteration includes machine.Get() / machine.Put(). diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 6c54712d1..61ed24d01 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -43,9 +43,6 @@ type machine struct { // kernel is the set of global structures. kernel ring0.Kernel - // mappingCache is used for mapPhysical. - mappingCache sync.Map - // mu protects vCPUs. mu sync.RWMutex @@ -63,6 +60,12 @@ type machine struct { // maxVCPUs is the maximum number of vCPUs supported by the machine. maxVCPUs int + // maxSlots is the maximum number of memory slots supported by the machine. + maxSlots int + + // usedSlots is the set of used physical addresses (sorted). + usedSlots []uintptr + // nextID is the next vCPU ID. nextID uint32 } @@ -100,8 +103,11 @@ type vCPU struct { // tid is the last set tid. tid uint64 - // switches is a count of world switches (informational only). - switches uint32 + // userExits is the count of user exits. + userExits uint64 + + // guestExits is the count of guest to host world switches. + guestExits uint64 // faults is a count of world faults (informational only). faults uint32 @@ -124,6 +130,7 @@ type vCPU struct { // vCPUArchState is the architecture-specific state. vCPUArchState + // dieState holds state related to vCPU death. dieState dieState } @@ -152,7 +159,7 @@ func (m *machine) newVCPU() *vCPU { fd: int(fd), machine: m, } - c.CPU.Init(&m.kernel, c) + c.CPU.Init(&m.kernel, c.id, c) m.vCPUsByID[c.id] = c // Ensure the signal mask is correct. @@ -180,10 +187,8 @@ func newMachine(vm int) (*machine, error) { // Create the machine. m := &machine{fd: vm} m.available.L = &m.mu - m.kernel.Init(ring0.KernelOpts{ - PageTables: pagetables.New(newAllocator()), - }) + // Pull the maximum vCPUs. maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) if errno != 0 { m.maxVCPUs = _KVM_NR_VCPUS @@ -191,10 +196,21 @@ func newMachine(vm int) (*machine, error) { m.maxVCPUs = int(maxVCPUs) } log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) - - // Create the vCPUs map/slices. 
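// Note: the memslot probe below mirrors the vCPU probe above. Capability
// 0x0a is Linux's KVM_CAP_NR_MEMSLOTS; if the extension check fails, the
// static fallback _KVM_NR_MEMSLOTS (0x100, i.e. 256 slots) from kvm_const.go
// is used instead.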
m.vCPUsByTID = make(map[uint64]*vCPU) m.vCPUsByID = make([]*vCPU, m.maxVCPUs) + m.kernel.Init(ring0.KernelOpts{ + PageTables: pagetables.New(newAllocator()), + }, m.maxVCPUs) + + // Pull the maximum slots. + maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) + if errno != 0 { + m.maxSlots = _KVM_NR_MEMSLOTS + } else { + m.maxSlots = int(maxSlots) + } + log.Debugf("The maximum number of slots is %d.", m.maxSlots) + m.usedSlots = make([]uintptr, m.maxSlots) // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These @@ -207,15 +223,9 @@ func newMachine(vm int) (*machine, error) { pagetables.MapOpts{AccessType: usermem.AnyAccess}, pr.physical) - // And keep everything in the upper half. - m.kernel.PageTables.Map( - usermem.Addr(ring0.KernelStartAddress|pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: usermem.AnyAccess}, - pr.physical) - return true // Keep iterating. }) + m.mapUpperHalf(m.kernel.PageTables) var physicalRegionsReadOnly []physicalRegion var physicalRegionsAvailable []physicalRegion @@ -272,6 +282,20 @@ func newMachine(vm int) (*machine, error) { return m, nil } +// hasSlot returns true iff the given address is mapped. +// +// This must be done via a linear scan, since it may run on the fault path, +// where locking and allocation are not permitted. +// +//go:nosplit +func (m *machine) hasSlot(physical uintptr) bool { + for i := 0; i < len(m.usedSlots); i++ { + if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { + return true + } + } + return false +} + // mapPhysical checks for the mapping of a physical range, and installs one if // not available. This attempts to be efficient for calls in the hot path. // @@ -286,8 +310,8 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg panic("mapPhysical on unknown physical address") } - if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { - // Not present in the cache; requires setting the slot. + // Is this already mapped? Check the usedSlots. + if !m.hasSlot(physicalStart) { if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok { panic("handleBluepillFault failed") } @@ -339,6 +363,11 @@ func (m *machine) Destroy() { // Get gets an available vCPU. // // This will return with the OS thread locked. +// +// It is guaranteed that if any OS thread TID is in guest, m.vCPUsByTID[TID] +// points to the vCPU on which the OS thread TID is running. So if Get() +// returns with the current context in guest, the vCPU it returns must be the +// same vCPU on which the calling OS thread is running. func (m *machine) Get() *vCPU { m.mu.RLock() runtime.LockOSThread() @@ -443,6 +472,19 @@ func (m *machine) newDirtySet() *dirtySet { } } +// dropPageTables drops cached page table entries. +func (m *machine) dropPageTables(pt *pagetables.PageTables) { + m.mu.Lock() + defer m.mu.Unlock() + + // Clear from all PCIDs. + for _, c := range m.vCPUsByID { + if c != nil && c.PCIDs != nil { + c.PCIDs.Drop(pt) + } + } +} + // lock marks the vCPU as in user mode. // // This should only be called directly when known to be safe, i.e. when @@ -502,6 +544,8 @@ var pid = syscall.Getpid() // // This effectively unwinds the state machine.
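//
// bounce() snapshots guestExits and userExits before spinning (see below):
// if the vCPU has exited to user mode since the snapshot (and, when
// forceGuestExit is set, has also left guest mode), the transition we were
// trying to force has already happened, so bounce() can return without
// delivering another signal.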
func (c *vCPU) bounce(forceGuestExit bool) { + origGuestExits := atomic.LoadUint64(&c.guestExits) + origUserExits := atomic.LoadUint64(&c.userExits) for { switch state := atomic.LoadUint32(&c.state); state { case vCPUReady, vCPUWaiter: @@ -557,6 +601,14 @@ func (c *vCPU) bounce(forceGuestExit bool) { // Should not happen: the above is exhaustive. panic("invalid state") } + + // Check if we've missed the state transition, but + // we can safely return at this point in time. + newGuestExits := atomic.LoadUint64(&c.guestExits) + newUserExits := atomic.LoadUint64(&c.userExits) + if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { + return + } } } diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index acc823ba6..c67127d95 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -18,14 +18,17 @@ package kvm import ( "fmt" + "math/big" "reflect" "runtime/debug" "syscall" + "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) @@ -84,19 +87,6 @@ const ( poolPCIDs = 8 ) -// dropPageTables drops cached page table entries. -func (m *machine) dropPageTables(pt *pagetables.PageTables) { - m.mu.Lock() - defer m.mu.Unlock() - - // Clear from all PCIDs. - for _, c := range m.vCPUsByID { - if c != nil && c.PCIDs != nil { - c.PCIDs.Drop(pt) - } - } -} - // initArchState initializes architecture-specific state. func (c *vCPU) initArchState() error { var ( @@ -144,6 +134,7 @@ func (c *vCPU) initArchState() error { // Set the entrypoint for the kernel. kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) + kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet // Set the system registers. @@ -152,8 +143,8 @@ func (c *vCPU) initArchState() error { } // Set the user registers. - if err := c.setUserRegisters(&kernelUserRegs); err != nil { - return err + if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) } // Allocate some floating point state save area for the local vCPU. @@ -166,6 +157,133 @@ func (c *vCPU) initArchState() error { return c.setSystemTime() } +// bitsForScaling returns the bits available for storing the fraction component +// of the TSC scaling ratio. This allows us to replicate the (bad) math done by +// the kernel below in scaledTSC, and ensure we can compute an exact zero +// offset in setSystemTime. +// +// These constants correspond to kvm_tsc_scaling_ratio_frac_bits. +var bitsForScaling = func() int64 { + fs := cpuid.HostFeatureSet() + if fs.Intel() { + return 48 // See vmx.c (kvm sources). + } else if fs.AMD() { + return 32 // See svm.c (svm sources). + } else { + return 63 // Unknown: theoretical maximum. + } +}() + +// scaledTSC returns the host TSC scaled by the given frequency. +// +// This assumes a current frequency of 1. We require only the unitless ratio of +// rawFreq to some current frequency. See setSystemTime for context. +// +// The kernel math guarantees that all bits of the multiplication and division +// will be correctly preserved and applied. However, it is not possible to +// actually store the ratio correctly. 
So we need to use the same scheme in +// order to calculate the scaled frequency and get the same result. +// +// We can assume that the current frequency is (1), so we are calculating a +// strict inverse of this value. This simplifies this function considerably. +// +// Roughly, the returned value "scaledTSC" will have: +// scaledTSC/hostTSC == 1/rawFreq +// +//go:nosplit +func scaledTSC(rawFreq uintptr) int64 { + scale := int64(1 << bitsForScaling) + ratio := big.NewInt(scale / int64(rawFreq)) + ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc()))) + ratio.Div(ratio, big.NewInt(scale)) + return ratio.Int64() +} + +// setSystemTime sets the vCPU to the system time. +func (c *vCPU) setSystemTime() error { + // First, scale down the clock frequency to the lowest value allowed by + // the API itself. How low we can go depends on the underlying + // hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD. + // Even the lower bound here will take a 4GHz frequency down to 1Hz, + // meaning that everything should be able to handle a KHz setting of 1 + // with bits to spare. + // + // Note that reducing the clock does not typically require special + // capabilities as it is emulated in KVM. We don't actually use this + // capability, but it means that this method should be robust to + // different hardware configurations. + rawFreq, err := c.getTSCFreq() + if err != nil { + return c.setSystemTimeLegacy() + } + if err := c.setTSCFreq(1); err != nil { + return c.setSystemTimeLegacy() + } + + // Always restore the original frequency. + defer func() { + if err := c.setTSCFreq(rawFreq); err != nil { + panic(err.Error()) + } + }() + + // Attempt to set the system time in this compressed world. The + // calculation for offset normally looks like: + // + // offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc()); + // + // So as long as the kvm_scale_tsc component is constant before and + // after the call to set the TSC value (and it is passed as the + // target_tsc), we will compute an offset value of zero. + // + // This is effectively cheating to make our "setSystemTime" call so + // unbelievably, incredibly fast that we do it "instantly" and all the + // calculations result in an offset of zero. + lastTSC := scaledTSC(rawFreq) + for { + if err := c.setTSC(uint64(lastTSC)); err != nil { + return err + } + nextTSC := scaledTSC(rawFreq) + if lastTSC == nextTSC { + return nil + } + lastTSC = nextTSC // Try again. + } +} + +// setSystemTimeLegacy calibrates and sets an approximate system time. +func (c *vCPU) setSystemTimeLegacy() error { + const minIterations = 10 + minimum := uint64(0) + for iter := 0; ; iter++ { + // Try to set the TSC to an estimate of where it will be + // on the host during a "fast" system call iteration. + start := uint64(ktime.Rdtsc()) + if err := c.setTSC(start + (minimum / 2)); err != nil { + return err + } + // See if this is our new minimum call time. Note that this + // serves two functions: one, we make sure that we are + // accurately predicting the offset we need to set. Second, we + // don't want to do the final set on a slow call, which could + // produce a really bad result. + end := uint64(ktime.Rdtsc()) + if end < start { + continue // Totally bogus: unstable TSC? + } + current := end - start + if current < minimum || iter == 0 { + minimum = current // Set our new minimum. + } + // Is this past minIterations and within ~12.5% of minimum?
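+ // ((minimum << 3) + minimum) >> 3 is floor(minimum * 9/8), i.e. the
+ // acceptance threshold sits 12.5% above the fastest call observed so
+ // far; the shift arithmetic avoids overflow for any realistic TSC delta.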
+ upperThreshold := (((minimum << 3) + minimum) >> 3) + if iter >= minIterations && current <= upperThreshold { + return nil + } + } +} + // nonCanonical generates a canonical address return. // //go:nosplit @@ -345,3 +463,41 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { func availableRegionsForSetMem() (phyRegions []physicalRegion) { return physicalRegions } + +var execRegions = func() (regions []region) { + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" { + return + } + if vr.accessType.Execute { + regions = append(regions, vr.region) + } + }) + return +}() + +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + for _, r := range execRegions { + physical, length, ok := translateToPhysical(r.virtual) + if !ok || length < r.length { + panic("impossible translation") + } + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|r.virtual), + r.length, + pagetables.MapOpts{AccessType: usermem.Execute}, + physical) + } + for start, end := range m.kernel.EntryRegions() { + regionLen := end - start + physical, length, ok := translateToPhysical(start) + if !ok || length < regionLen { + panic("impossible translation") + } + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|start), + regionLen, + pagetables.MapOpts{AccessType: usermem.ReadWrite}, + physical) + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 290f035dd..b430f92c6 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -23,7 +23,6 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/time" ) // loadSegments copies the current segments. @@ -61,91 +60,63 @@ func (c *vCPU) setCPUID() error { return nil } -// setSystemTime sets the TSC for the vCPU. +// getTSCFreq gets the TSC frequency. // -// This has to make the call many times in order to minimize the intrinsic -// error in the offset. Unfortunately KVM does not expose a relative offset via -// the API, so this is an approximation. We do this via an iterative algorithm. -// This has the advantage that it can generally deal with highly variable -// system call times and should converge on the correct offset. -func (c *vCPU) setSystemTime() error { - const ( - _MSR_IA32_TSC = 0x00000010 - calibrateTries = 10 - ) - registers := modelControlRegisters{ - nmsrs: 1, - } - registers.entries[0] = modelControlRegister{ - index: _MSR_IA32_TSC, +// It returns the frequency in KHz, as reported by KVM_GET_TSC_KHZ. +func (c *vCPU) getTSCFreq() (uintptr, error) { + rawFreq, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_TSC_KHZ, + 0 /* ignored */) + if errno != 0 { + return 0, errno } - target := uint64(^uint32(0)) - for done := 0; done < calibrateTries; { - start := uint64(time.Rdtsc()) - registers.entries[0].data = start + target - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_MSRS, - uintptr(unsafe.Pointer(&registers))); errno != 0 { - return fmt.Errorf("error setting system time: %v", errno) - } - // See if this is our new minimum call time. Note that this - // serves two functions: one, we make sure that we are - // accurately predicting the offset we need to set. Second, we - // don't want to do the final set on a slow call, which could - // produce a really bad result. So we only count attempts - // within +/- 6.25% of our minimum as an attempt.
- end := uint64(time.Rdtsc()) - if end < start { - continue // Totally bogus. - } - half := (end - start) / 2 - if half < target { - target = half - } - if (half - target) < target/8 { - done++ - } + return rawFreq, nil +} + +// setTSCFreq sets the TSC frequency. +func (c *vCPU) setTSCFreq(freq uintptr) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_TSC_KHZ, + freq /* khz */); errno != 0 { + return fmt.Errorf("error setting TSC frequency: %v", errno) } return nil } -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 +// setTSC sets the TSC value. +func (c *vCPU) setTSC(value uint64) error { + const _MSR_IA32_TSC = 0x00000010 + registers := modelControlRegisters{ + nmsrs: 1, } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) + registers.entries[0].index = _MSR_IA32_TSC + registers.entries[0].data = value if _, _, errno := syscall.RawSyscall( syscall.SYS_IOCTL, uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) + _KVM_SET_MSRS, + uintptr(unsafe.Pointer(&registers))); errno != 0 { + return fmt.Errorf("error setting TSC: %v", errno) } return nil } // setUserRegisters sets user registers in the vCPU. -func (c *vCPU) setUserRegisters(uregs *userRegs) error { +// +//go:nosplit +func (c *vCPU) setUserRegisters(uregs *userRegs) syscall.Errno { if _, _, errno := syscall.RawSyscall( syscall.SYS_IOCTL, uintptr(c.fd), _KVM_SET_REGS, uintptr(unsafe.Pointer(uregs))); errno != 0 { - return fmt.Errorf("error setting user registers: %v", errno) + return errno } - return nil + return 0 } // getUserRegisters reloads user registers in the vCPU. @@ -175,3 +146,17 @@ func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { } return nil } + +// getSystemRegisters gets system registers. +// +//go:nosplit +func (c *vCPU) getSystemRegisters(sregs *systemRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return errno + } + return 0 +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 9db171af9..54837f20c 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -19,6 +19,7 @@ package kvm import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/usermem" ) @@ -48,6 +49,18 @@ const ( poolPCIDs = 8 ) +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + applyPhysicalRegions(func(pr physicalRegion) bool { + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + return true // Keep iterating. + }) +} + // Get all read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { var rdonlyRegions []region @@ -100,19 +113,6 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) { return phyRegions } -// dropPageTables drops cached page table entries. -func (m *machine) dropPageTables(pt *pagetables.PageTables) { - m.mu.Lock() - defer m.mu.Unlock() - - // Clear from all PCIDs. - for _, c := range m.vCPUsByID { - if c.PCIDs != nil { - c.PCIDs.Drop(pt) - } - } -} - // nonCanonical generates a canonical address return. // //go:nosplit diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 905712076..84992c06d 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -79,7 +79,7 @@ func (c *vCPU) initArchState() error { } // tcr_el1 - data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS + data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS | _TCR_A1 reg.id = _KVM_ARM64_REGS_TCR_EL1 if err := c.setOneRegister(®); err != nil { return err @@ -103,7 +103,7 @@ func (c *vCPU) initArchState() error { c.SetTtbr0Kvm(uintptr(data)) // ttbr1_el1 - data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0) + data = c.machine.kernel.PageTables.TTBR1_EL1(false, 1) reg.id = _KVM_ARM64_REGS_TTBR1_EL1 if err := c.setOneRegister(®); err != nil { @@ -191,42 +191,6 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error { return nil } -// setCPUID sets the CPUID to be used by the guest. -func (c *vCPU) setCPUID() error { - return nil -} - -// setSystemTime sets the TSC for the vCPU. -func (c *vCPU) setSystemTime() error { - return nil -} - -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 - } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) - } - - return nil -} - // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { // Check for canonical addresses. @@ -271,8 +235,9 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return c.fault(int32(syscall.SIGSEGV), info) case ring0.Vector(bounce): // ring0.VirtualizationException return usermem.NoAccess, platform.ErrContextInterrupt - case ring0.El0Sync_undef, - ring0.El1Sync_undef: + case ring0.El0Sync_undef: + return c.fault(int32(syscall.SIGILL), info) + case ring0.El1Sync_undef: *info = arch.SignalInfo{ Signo: int32(syscall.SIGILL), Code: 1, // ILL_ILLOPC (illegal opcode). diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 9f86f6a7a..1d6ca245a 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. 
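A note on the _TCR_A1 bit set in initArchState above: with TCR_EL1.A1 set, the hardware takes the current ASID from TTBR1_EL1 rather than TTBR0_EL1, which is why the kernel half pins ASID 1 (the TTBR1_EL1(false, 1) call) while application switches rewrite only TTBR0_EL1 and the ASID field. A rough model of the field update that the BFI instruction performs in SWITCH_TO_APP_PAGETABLE further below (the helper name withASID is illustrative):

// withASID returns ttbr1 with its ASID field (bits 63:48) replaced,
// mirroring BFI $48, R1, $16, R0 in entry_arm64.s.
func withASID(ttbr1 uint64, asid uint16) uint64 {
	return (ttbr1 &^ (uint64(0xffff) << 48)) | (uint64(asid) << 48)
}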
// +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -143,3 +143,29 @@ func (c *vCPU) waitUntilNot(state uint32) { panic("futex wait error") } } + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +func (c *vCPU) setSignalMask() error { + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) + data.mask2 = ^uint32(bounceSignalMask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + return fmt.Errorf("error setting signal mask: %v", errno) + } + + return nil +} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index 4dad877ba..c5235ca9d 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -23,6 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) +// TLSWorks is a TLS test. +// +// It sets TPIDR_EL0, issues a system call, and returns true iff the TLS +// value is preserved across the call. +func TLSWorks() bool + // SetTestTarget sets the rip appropriately. func SetTestTarget(regs *arch.Registers, fn func()) { regs.Pc = uint64(reflect.ValueOf(fn).Pointer()) } diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s index 6caf7282d..7348c29a5 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -50,6 +50,22 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0 start: B start +TEXT ·TLSWorks(SB),NOSPLIT,$0-8 + NO_LOCAL_POINTERS + MOVD $0x6789, R5 + MSR R5, TPIDR_EL0 + MOVD $SYS_GETPID, R8 // getpid + SVC + MRS TPIDR_EL0, R6 + CMP R5, R6 + BNE isNaN + MOVD $1, R0 + MOVD R0, ret+0(FP) + RET +isNaN: + MOVD $0, ret+0(FP) + RET + TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS // gc will touch fpsimd, so we should test it. diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index c8897d34f..4dcdbf8a7 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -34,7 +34,7 @@ type virtualRegion struct { } // mapsLine matches a single line from /proc/PID/maps. -var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)") +var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)") // excludeRegion returns true if these regions should be excluded from the // physical map. Virtual regions need to be excluded if get_user_pages will diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index ba031516a..dcfe839a7 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/hostmm" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) @@ -52,6 +53,10 @@ type Platform interface { // can reliably return ErrContextCPUPreempted.
DetectsCPUPreemption() bool + // HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method + // is supported. + HaveGlobalMemoryBarrier() bool + // MapUnit returns the alignment used for optional mappings into this // platform's AddressSpaces. Higher values indicate lower per-page costs // for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates @@ -97,6 +102,15 @@ type Platform interface { // called. PreemptAllCPUs() error + // GlobalMemoryBarrier blocks until all threads running application code + // (via Context.Switch) and all task goroutines "have passed through a + // state where all memory accesses to user-space addresses match program + // order between entry to and return from [GlobalMemoryBarrier]", as for + // membarrier(2). + // + // Preconditions: HaveGlobalMemoryBarrier() == true. + GlobalMemoryBarrier() error + // SyscallFilters returns syscalls made exclusively by this platform. SyscallFilters() seccomp.SyscallRules } @@ -115,6 +129,43 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error { panic("This platform does not support CPU preemption detection") } +// UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and +// Platform.GlobalMemoryBarrier by invoking equivalent functionality on the +// host. +type UseHostGlobalMemoryBarrier struct{} + +// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier. +func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool { + return hostmm.HaveGlobalMemoryBarrier() +} + +// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier. +func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error { + return hostmm.GlobalMemoryBarrier() +} + +// UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and +// Platform.GlobalMemoryBarrier by invoking a process-local memory barrier. +// This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for +// platforms for which application code executes while using the sentry's +// mm_struct. +type UseHostProcessMemoryBarrier struct{} + +// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier. +func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool { + // Fall back to a global memory barrier if a process-local one isn't + // available. + return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier() +} + +// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier. +func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error { + if hostmm.HaveProcessMemoryBarrier() { + return hostmm.ProcessMemoryBarrier() + } + return hostmm.GlobalMemoryBarrier() +} + // MemoryManager represents an abstraction above the platform address space // which manages memory mappings and their contents. type MemoryManager interface { @@ -245,14 +296,19 @@ type AddressSpace interface { // physical memory) to the mapping. The precommit flag is advisory and // implementations may choose to ignore it. // - // Preconditions: addr and fr must be page-aligned. fr.Length() > 0. - // at.Any() == true. At least one reference must be held on all pages in - // fr, and must continue to be held as long as pages are mapped. + // Preconditions: + // * addr and fr must be page-aligned. + // * fr.Length() > 0. + // * at.Any() == true. + // * At least one reference must be held on all pages in fr, and must + // continue to be held as long as pages are mapped. MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error // Unmap unmaps the given range. 
// - // Preconditions: addr is page-aligned. length > 0. + // Preconditions: + // * addr is page-aligned. + // * length > 0. Unmap(addr usermem.Addr, length uint64) // Release releases this address space. After releasing, a new AddressSpace diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index e04165fbf..fc43cc3c0 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/hostcpu", "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go index 1e07cfd0d..b0970e356 100644 --- a/pkg/sentry/platform/ptrace/filters.go +++ b/pkg/sentry/platform/ptrace/filters.go @@ -24,10 +24,9 @@ import ( // SyscallFilters returns syscalls made exclusively by the ptrace platform. func (*PTrace) SyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - unix.SYS_GETCPU: {}, - unix.SYS_SCHED_SETAFFINITY: {}, - syscall.SYS_PTRACE: {}, - syscall.SYS_TGKILL: {}, - syscall.SYS_WAIT4: {}, + unix.SYS_GETCPU: {}, + syscall.SYS_PTRACE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_WAIT4: {}, } } diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index b52d0fbd8..f56aa3b79 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -192,6 +192,7 @@ func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} type PTrace struct { platform.MMapMinAddr platform.NoCPUPreemptionDetection + platform.UseHostGlobalMemoryBarrier } // New returns a new ptrace-based implementation of the platform interface. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index e1d54d8a2..812ab80ef 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -518,11 +518,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { } defer c.interrupt.Disable() - // Ensure that the CPU set is bound appropriately; this makes the - // emulation below several times faster, presumably by avoiding - // interprocessor wakeups and by simplifying the schedule. - t.bind() - // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err)) diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 84b699f0d..020bbda79 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -201,7 +201,7 @@ func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFActi seccomp.RuleSet{ Rules: seccomp.SyscallRules{ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + {seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0)}, }, }, Action: linux.SECCOMP_RET_ALLOW, diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 2ce528601..8548853da 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -80,9 +80,9 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro Rules: seccomp.SyscallRules{ syscall.SYS_CLONE: []seccomp.Rule{ // Allow creation of new subprocesses (used by the master). 
- {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)}, + {seccomp.EqualTo(syscall.CLONE_FILES | syscall.SIGKILL)}, // Allow creation of new threads within a single address space (used by addresss spaces). - {seccomp.AllowValue( + {seccomp.EqualTo( syscall.CLONE_FILES | syscall.CLONE_FS | syscall.CLONE_SIGHAND | @@ -97,14 +97,14 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // For the stub prctl dance (all). syscall.SYS_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)}, + {seccomp.EqualTo(syscall.PR_SET_PDEATHSIG), seccomp.EqualTo(syscall.SIGKILL)}, }, syscall.SYS_GETPPID: {}, // For the stub to stop itself (all). syscall.SYS_GETPID: {}, syscall.SYS_KILL: []seccomp.Rule{ - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SIGSTOP)}, }, // Injected to support the address space operations. @@ -115,7 +115,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro }) } rules = appendArchSeccompRules(rules, defaultAction) - instrs, err := seccomp.BuildProgram(rules, defaultAction) + instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction) if err != nil { return nil, err } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 245b20722..533e45497 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -18,29 +18,12 @@ package ptrace import ( - "sync/atomic" "syscall" "unsafe" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/hostcpu" - "gvisor.dev/gvisor/pkg/sync" ) -// maskPool contains reusable CPU masks for setting affinity. Unfortunately, -// runtime.NumCPU doesn't actually record the number of CPUs on the system, it -// just records the number of CPUs available in the scheduler affinity set at -// startup. This may a) change over time and b) gives a number far lower than -// the maximum indexable CPU. To prevent lots of allocation in the hot path, we -// use a pool to store large masks that we can reuse during bind. -var maskPool = sync.Pool{ - New: func() interface{} { - const maxCPUs = 1024 // Not a hard limit; see below. - return make([]uintptr, maxCPUs/64) - }, -} - // unmaskAllSignals unmasks all signals on the current thread. // //go:nosplit @@ -49,47 +32,3 @@ func unmaskAllSignals() syscall.Errno { _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) return errno } - -// setCPU sets the CPU affinity. -func (t *thread) setCPU(cpu uint32) error { - mask := maskPool.Get().([]uintptr) - n := int(cpu / 64) - v := uintptr(1 << uintptr(cpu%64)) - if n >= len(mask) { - // See maskPool note above. We've actually exceeded the number - // of available cores. Grow the mask and return it. - mask = make([]uintptr, n+1) - } - mask[n] |= v - if _, _, errno := syscall.RawSyscall( - unix.SYS_SCHED_SETAFFINITY, - uintptr(t.tid), - uintptr(len(mask)*8), - uintptr(unsafe.Pointer(&mask[0]))); errno != 0 { - return errno - } - mask[n] &^= v - maskPool.Put(mask) - return nil -} - -// bind attempts to ensure that the thread is on the same CPU as the current -// thread. This provides no guarantees as it is fundamentally a racy operation: -// CPU sets may change and we may be rescheduled in the middle of this -// operation. 
As a result, no failures are reported. -// -// Precondition: the current runtime thread should be locked. -func (t *thread) bind() { - currentCPU := hostcpu.GetCPU() - - if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { - // Set the affinity on the thread and save the CPU for next - // round; we don't expect CPUs to bounce around too frequently. - // - // (It's worth noting that we could move CPUs between this point - // and when the tracee finishes executing. But that would be - // roughly the status quo anyways -- we're just maximizing our - // chances of colocation, not guaranteeing it.) - t.setCPU(currentCPU) - } -} diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 0bee995e4..7ee20d89a 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 9c6c2cf5c..00899273e 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -76,15 +76,41 @@ type KernelOpts struct { type KernelArchState struct { KernelOpts + // cpuEntries is the array of kernelEntry for all CPUs. + cpuEntries []kernelEntry + // globalIDT is our set of interrupt gates. - globalIDT idt64 + globalIDT *idt64 } -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { +// kernelEntry contains the minimal CPU-specific arch state that must be +// mapped in the upper half of the address space. A malicious application +// might steal information from it via CPU bugs, so it is kept minimal. +type kernelEntry struct { // stack is the stack used for interrupts on this CPU. stack [256]byte + // scratch0 is scratch space for temporary usage. + scratch0 uint64 + + // stackTop is the top of the stack. + stackTop uint64 + + // cpuSelf is a back reference to the CPU. + cpuSelf *CPU + + // kernelCR3 is the CR3 used for the sentry kernel. + kernelCR3 uintptr + + // gdt is the CPU's descriptor table. + gdt descriptorTable + + // tss is the CPU's task state. + tss TaskState64 +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { // errorCode is the error code from the last exception. errorCode uintptr @@ -97,11 +123,7 @@ type CPUArchState struct { // exception. errorType uintptr - // gdt is the CPU's descriptor table. - gdt descriptorTable - - // tss is the CPU's task state. - tss TaskState64 + *kernelEntry } // ErrorCode returns the last error code. diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index 0e2ab716c..508236e46 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -77,6 +77,9 @@ type CPUArchState struct { // lazyVFP is the value of cpacr_el1. lazyVFP uintptr + + // appASID is the ASID of the guest application. + appASID uintptr } // ErrorCode returns the last error code. diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index 7fa43c2f5..d87b1fd00 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -36,12 +36,15 @@ func sysenter() // This must be called prior to sysret/iret. func swapgs() +// jumpToKernel jumps to the kernel version of the current RIP.
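+//
+// This supports the KPTI-style split introduced in this change: sysret and
+// iret below start on whatever alias of the text the caller used, OR
+// KernelStartAddress into their return address, and finish the world switch
+// from the upper-half alias, which mapUpperHalf keeps mapped (execute-only)
+// in the application page tables so that the instructions after WRITE_CR3
+// remain reachable under the user CR3.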
+func jumpToKernel() + // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. // // See stubs.go for a note regarding the frame size of this function. -func sysret(*CPU, *arch.Registers) Vector +func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // "iret is the cadillac of CPL switching." // @@ -50,7 +53,7 @@ func sysret(*CPU, *arch.Registers) Vector // iret is nearly identical to sysret, except an iret is used to fully restore // all user state. This must be called in cases where all registers need to be // restored. -func iret(*CPU, *arch.Registers) Vector +func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // exception is the generic exception entry. // diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 02df38331..f59747df3 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -63,6 +63,15 @@ MOVQ offset+PTRACE_RSI(reg), SI; \ MOVQ offset+PTRACE_RDI(reg), DI; +// WRITE_CR3() writes the given CR3 value. +// +// The code corresponds to: +// +// mov %rax, %cr3 +// +#define WRITE_CR3() \ + BYTE $0x0f; BYTE $0x22; BYTE $0xd8; + // SWAP_GS swaps the kernel GS (CPU). #define SWAP_GS() \ BYTE $0x0F; BYTE $0x01; BYTE $0xf8; @@ -75,15 +84,9 @@ #define SYSRET64() \ BYTE $0x48; BYTE $0x0f; BYTE $0x07; -// LOAD_KERNEL_ADDRESS loads a kernel address. -#define LOAD_KERNEL_ADDRESS(from, to) \ - MOVQ from, to; \ - ORQ ·KernelStartAddress(SB), to; - // LOAD_KERNEL_STACK loads the kernel stack. -#define LOAD_KERNEL_STACK(from) \ - LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \ - LEAQ CPU_STACK_TOP(SP), SP; +#define LOAD_KERNEL_STACK(entry) \ + MOVQ ENTRY_STACK_TOP(entry), SP; // See kernel.go. TEXT ·Halt(SB),NOSPLIT,$0 @@ -95,58 +98,93 @@ TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() RET +// jumpToKernel changes execution to the kernel address space. +// +// This works by changing the return value to the kernel version. +TEXT ·jumpToKernel(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + ORQ ·KernelStartAddress(SB), AX // Future return value. + MOVQ AX, 0(SP) + RET + // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT,$0-24 - // Save original state. - LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) - LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + CALL ·jumpToKernel(SB) + // Save original state and stack. sysenter() or exception() + // from APP(gr3) will switch to this stack, set the return + // value (vector: 32(SP)) and then do RET, which will also + // automatically return to the lower half. + MOVQ cpu+0(FP), BX + MOVQ regs+8(FP), AX + MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + // save SP AX userCR3 on the kernel stack. + MOVQ CPU_ENTRY(BX), BX + LOAD_KERNEL_STACK(BX) + PUSHQ PTRACE_RSP(AX) + PUSHQ PTRACE_RAX(AX) + PUSHQ CX + // Restore user register state. REGISTERS_LOAD(AX, 0) MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. - MOVQ PTRACE_RSP(AX), SP // Restore the stack directly. - MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + + // restore userCR3, AX, SP. + POPQ AX // Get userCR3. + WRITE_CR3() // Switch to userCR3. + POPQ AX // Restore AX. + POPQ SP // Restore SP. SYSRET64() // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT,$0-24 - // Save original state. - LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) - LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + CALL ·jumpToKernel(SB) + // Save original state and stack. 
sysenter() or exception() + // from APP(gr3) will switch to this stack, set the return + // value (vector: 32(SP)) and then do RET, which will also + // automatically return to the lower half. + MOVQ cpu+0(FP), BX + MOVQ regs+8(FP), AX + MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // Build an IRET frame & restore state. + MOVQ CPU_ENTRY(BX), BX LOAD_KERNEL_STACK(BX) - MOVQ PTRACE_SS(AX), BX; PUSHQ BX - MOVQ PTRACE_RSP(AX), CX; PUSHQ CX - MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX - MOVQ PTRACE_CS(AX), DI; PUSHQ DI - MOVQ PTRACE_RIP(AX), SI; PUSHQ SI - REGISTERS_LOAD(AX, 0) // Restore most registers. - MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + PUSHQ PTRACE_SS(AX) + PUSHQ PTRACE_RSP(AX) + PUSHQ PTRACE_FLAGS(AX) + PUSHQ PTRACE_CS(AX) + PUSHQ PTRACE_RIP(AX) + PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack. + PUSHQ CX // Save userCR3 on kernel stack. + REGISTERS_LOAD(AX, 0) // Restore most registers. + POPQ AX // Get userCR3. + WRITE_CR3() // Switch to userCR3. + POPQ AX // Restore AX. IRET() // See entry_amd64.go. TEXT ·resume(SB),NOSPLIT,$0 // See iret, above. - MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX - MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX - MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI - MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI - REGISTERS_LOAD(GS, CPU_REGISTERS) - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + PUSHQ CPU_REGISTERS+PTRACE_SS(AX) + PUSHQ CPU_REGISTERS+PTRACE_RSP(AX) + PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX) + PUSHQ CPU_REGISTERS+PTRACE_CS(AX) + PUSHQ CPU_REGISTERS+PTRACE_RIP(AX) + REGISTERS_LOAD(AX, CPU_REGISTERS) + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX IRET() // See entry_amd64.go. TEXT ·Start(SB),NOSPLIT,$0 - LOAD_KERNEL_STACK(AX) // Set the stack. PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. PUSHQ AX // First argument (CPU). @@ -155,53 +193,60 @@ TEXT ·Start(SB),NOSPLIT,$0 // See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT,$0 - // Interrupts are always disabled while we're executing in kernel mode - // and always enabled while executing in user mode. Therefore, we can - // reliably look at the flags in R11 to determine where this syscall - // was from. - TESTL $_RFLAGS_IF, R11 + // _RFLAGS_IOPL0 is always set in the user mode and it is never set in + // the kernel mode. See the comment of UserFlagsSet for more details. + TESTL $_RFLAGS_IOPL0, R11 JZ kernel - user: SWAP_GS() - XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. + MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + WRITE_CR3() // Switch to kernel cr3. + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. - MOVQ BX, PTRACE_RAX(AX) // Save everything else. - MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ CX, PTRACE_RIP(AX) MOVQ R11, PTRACE_FLAGS(AX) - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX) - MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. - MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ SP, PTRACE_RSP(AX) + MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value. + MOVQ CX, PTRACE_RAX(AX) // Save everything else. 
+ MOVQ CX, PTRACE_ORIGRAX(AX) + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks. + MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. + MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. // Return to the kernel, where the frame is: // - // vector (sp+24) + // vector (sp+32) + // userCR3 (sp+24) // regs (sp+16) // cpu (sp+8) // vcpu.Switch (sp+0) // - MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. - MOVQ $Syscall, 24(SP) // Output vector. + MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. + MOVQ $Syscall, 32(SP) // Output vector. RET kernel: // We can't restore the original stack, but we can access the registers // in the CPU state directly. No need for temporary juggling. - MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) - REGISTERS_SAVE(GS, CPU_REGISTERS) - MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS) - MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS) - MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS) - MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. - MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ AX, ENTRY_SCRATCH0(GS) + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + REGISTERS_SAVE(AX, CPU_REGISTERS) + MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX) + MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX) + MOVQ ENTRY_SCRATCH0(GS), BX + MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) + MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) + MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. + MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_SELF(GS), AX // Load vCPU. PUSHQ AX // First argument (vCPU). CALL ·kernelSyscall(SB) // Call the trampoline. POPQ AX // Pop vCPU. @@ -230,16 +275,21 @@ TEXT ·exception(SB),NOSPLIT,$0 // ERROR_CODE (sp+8) // VECTOR (sp+0) // - TESTL $_RFLAGS_IF, 32(SP) + TESTL $_RFLAGS_IOPL0, 32(SP) JZ kernel user: SWAP_GS() ADDQ $-8, SP // Adjust for flags. MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs. + PUSHQ AX // Save user AX on stack. + MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + WRITE_CR3() // Switch to kernel cr3. + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX. + POPQ BX // Restore original AX. MOVQ BX, PTRACE_RAX(AX) // Save it. MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) @@ -249,34 +299,36 @@ user: MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) // Copy out and return. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. MOVQ 8(SP), CX // Load error code. - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version). - MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. - MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. - MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. - MOVQ BX, 24(SP) // Output vector. + MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version). + MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. + MOVQ CX, CPU_ERROR_CODE(AX) // Set error code. + MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. + MOVQ BX, 32(SP) // Output vector. RET kernel: // As per above, we can save directly. 
- MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) - MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) - REGISTERS_SAVE(GS, CPU_REGISTERS) - MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS) - MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS) - MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS) + PUSHQ AX + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + REGISTERS_SAVE(AX, CPU_REGISTERS) + POPQ BX + MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) + MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) + MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX) + MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX) + MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX) // Set the error code and adjust the stack. - MOVQ 8(SP), AX // Load the error code. - MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU. - MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ 8(SP), BX // Load the error code. + MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. + MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. MOVQ 0(SP), BX // BX contains the vector. - ADDQ $48, SP // Drop the exception frame. // Call the exception trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_SELF(GS), AX // Load vCPU. PUSHQ BX // Second argument (vector). PUSHQ AX // First argument (vCPU). CALL ·kernelException(SB) // Call the trampoline. diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index 9d29b7168..1079a024b 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -27,7 +27,9 @@ // ERET returns using the ELR and SPSR for the current exception level. #define ERET() \ - WORD $0xd69f03e0 + WORD $0xd69f03e0; \ + DSB $7; \ + ISB $15; // RSV_REG is a register that holds el1 information temporarily. #define RSV_REG R18_PLATFORM @@ -44,9 +46,11 @@ #define SCTLR_M 1 << 0 #define SCTLR_C 1 << 2 #define SCTLR_I 1 << 12 +#define SCTLR_DZE 1 << 14 #define SCTLR_UCT 1 << 15 +#define SCTLR_UCI 1 << 26 -#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT) +#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI | SCTLR_DZE) // cntkctl_el1: counter-timer kernel control register el1. #define CNTKCTL_EL0PCTEN 1 << 0 @@ -294,23 +298,27 @@ LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ MOVD RSV_REG, RSP; \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 - ISB $15; \ - DSB $15; + WORD $0xd538d092; //MRS TPIDR_EL1, R18 // SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application. #define SWITCH_TO_APP_PAGETABLE(from) \ - MOVD CPU_TTBR0_APP(from), RSV_REG; \ - WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + MRS TTBR1_EL1, R0; \ + MOVD CPU_APP_ASID(from), R1; \ + BFI $48, R1, $16, R0; \ + MSR R0, TTBR1_EL1; \ // set the ASID in TTBR1_EL1 (since TCR.A1 is set) ISB $15; \ - DSB $15; + MOVD CPU_TTBR0_APP(from), RSV_REG; \ + MSR RSV_REG, TTBR0_EL1; // SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable. #define SWITCH_TO_KVM_PAGETABLE(from) \ - MOVD CPU_TTBR0_KVM(from), RSV_REG; \ - WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + MRS TTBR1_EL1, R0; \ + MOVD $1, R1; \ + BFI $48, R1, $16, R0; \ + MSR R0, TTBR1_EL1; \ ISB $15; \ - DSB $15; + MOVD CPU_TTBR0_KVM(from), RSV_REG; \ + MSR RSV_REG, TTBR0_EL1; #define VFP_ENABLE \ MOVD $FPEN_ENABLE, R0; \ @@ -326,29 +334,30 @@ #define KERNEL_ENTRY_FROM_EL0 \ SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18, step2, switch user pagetable. 
- SWITCH_TO_KVM_PAGETABLE(RSV_REG); \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
- MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer.
- REGISTERS_SAVE(RSV_REG_APP, 0); \ // step4, save app context.
+ WORD $0xd538d092; \ // MRS TPIDR_EL1, R18
+ MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer.
+ REGISTERS_SAVE(RSV_REG_APP, 0); \ // step3, save app context.
 MOVD RSV_REG_APP, R20; \
 LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \
 ADD $16, RSP, RSP; \
 MOVD RSV_REG, PTRACE_R18(R20); \
 MOVD RSV_REG_APP, PTRACE_R9(R20); \
- MOVD R20, RSV_REG_APP; \
+ MRS TPIDR_EL0, R3; \
+ MOVD R3, PTRACE_TLS(R20); \
 WORD $0xd5384003; \ // MRS SPSR_EL1, R3
- MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \
+ MOVD R3, PTRACE_PSTATE(R20); \
 MRS ELR_EL1, R3; \
- MOVD R3, PTRACE_PC(RSV_REG_APP); \
+ MOVD R3, PTRACE_PC(R20); \
 WORD $0xd5384103; \ // MRS SP_EL0, R3
- MOVD R3, PTRACE_SP(RSV_REG_APP);
+ MOVD R3, PTRACE_SP(R20);
 
 // KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
 #define KERNEL_ENTRY_FROM_EL1 \
 WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
 REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context.
 MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
+ MRS TPIDR_EL0, R4; \
+ MOVD R4, CPU_REGISTERS+PTRACE_TLS(RSV_REG); \
 WORD $0xd5384004; \ // MRS SPSR_EL1, R4
 MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
 MRS ELR_EL1, R4; \
@@ -357,6 +366,26 @@
 MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
 LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack.
 
+// EXCEPTION_WITH_ERROR is a common exception handler function.
+#define EXCEPTION_WITH_ERROR(user, vector) \
+ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
+ WORD $0xd538601a; \ //MRS FAR_EL1, R26
+ MOVD R26, CPU_FAULT_ADDR(RSV_REG); \
+ MOVD $user, R3; \
+ MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user.
+ MOVD $vector, R3; \
+ MOVD R3, CPU_VECTOR_CODE(RSV_REG); \
+ MRS ESR_EL1, R3; \
+ MOVD R3, CPU_ERROR_CODE(RSV_REG); \
+ B ·kernelExitToEl1(SB);
+
+// storeAppASID writes the application's ASID value.
+TEXT ·storeAppASID(SB),NOSPLIT,$0-8
+ MOVD asid+0(FP), R1
+ MRS TPIDR_EL1, RSV_REG
+ MOVD R1, CPU_APP_ASID(RSV_REG)
+ RET
+
 // Halt halts execution.
TEXT ·Halt(SB),NOSPLIT,$0
 // Clear bluepill.
@@ -365,8 +394,6 @@ TEXT ·Halt(SB),NOSPLIT,$0
 BNE mmio_exit
 MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
 
- // Flush dcache.
- WORD $0xd5087e52 // DC CISW
 mmio_exit:
 // Disable fpsimd.
 WORD $0xd5381041 // MRS CPACR_EL1, R1
@@ -384,9 +411,6 @@ mmio_exit:
 MRS VBAR_EL1, R9
 MOVD R0, 0x0(R9)
 
- // Flush dcahce.
- WORD $0xd5087e52 // DC CISW
-
 RET
 
 // HaltAndResume halts execution and point the pointer to the resume function.
@@ -414,7 +438,7 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 MOVD R8, ret+0(FP)
 RET
 
-#define STACK_FRAME_SIZE 16
+#define STACK_FRAME_SIZE 32
 
 // kernelExitToEl0 is the entrypoint for application in guest_el0.
 // Prepare the vcpu environment for container application.
@@ -423,6 +447,8 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 MRS TPIDR_EL1, RSV_REG
 REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
 MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+ MRS TPIDR_EL0, R3
+ MOVD R3, CPU_REGISTERS+PTRACE_TLS(RSV_REG)
 
 WORD $0xd5384003 // MRS SPSR_EL1, R3
 MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG)
@@ -449,8 +475,18 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 MOVD PTRACE_PSTATE(RSV_REG_APP), R1
 WORD $0xd5184001 //MSR R1, SPSR_EL1
 
+ // We need to use a kernel-space address to execute the code below,
+ // since after SWITCH_TO_APP_PAGETABLE the ASID is changed to the
+ // app's ASID.
+ WORD $0x10000061 // ADR R1, do_exit_to_el0
+ ORR $0xffff000000000000, R1, R1
+ JMP (R1)
+
+do_exit_to_el0:
 // RSV_REG & RSV_REG_APP will be loaded at the end.
 REGISTERS_LOAD(RSV_REG_APP, 0)
+ MOVD PTRACE_TLS(RSV_REG_APP), RSV_REG
+ MSR RSV_REG, TPIDR_EL0
 
 // switch to user pagetable.
 MOVD PTRACE_R18(RSV_REG_APP), RSV_REG
@@ -458,15 +494,16 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 SUB $STACK_FRAME_SIZE, RSP, RSP
 STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+ STP (R0, R1), 16*1(RSP)
 
 WORD $0xd538d092 //MRS TPIDR_EL1, R18
 SWITCH_TO_APP_PAGETABLE(RSV_REG)
 
+ LDP 16*1(RSP), (R0, R1)
 LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
 ADD $STACK_FRAME_SIZE, RSP, RSP
 
- ISB $15
 ERET()
 
 // kernelExitToEl1 is the entrypoint for sentry in guest_el1.
@@ -482,6 +519,9 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
 MOVD R1, RSP
 
+ SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+ MRS TPIDR_EL1, RSV_REG
+
 REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
 MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
 
@@ -489,8 +529,6 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 // Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
- // Flush dcache.
- WORD $0xd5087e52 // DC CISW
 // Init.
 MOVD $SCTLR_EL1_DEFAULT, R1
 MSR R1, SCTLR_EL1
@@ -634,21 +672,7 @@ el0_svc:
 
el0_da:
el0_ia:
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- WORD $0xd538601a //MRS FAR_EL1, R26
-
- MOVD R26, CPU_FAULT_ADDR(RSV_REG)
-
- MOVD $1, R3
- MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
-
- MOVD $PageFault, R3
- MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
- MRS ESR_EL1, R3
- MOVD R3, CPU_ERROR_CODE(RSV_REG)
-
- B ·kernelExitToEl1(SB)
+ EXCEPTION_WITH_ERROR(1, PageFault)
 
el0_fpsimd_acc:
 B ·Shutdown(SB)
@@ -663,10 +687,7 @@ el0_sp_pc:
 B ·Shutdown(SB)
 
el0_undef:
- MOVD $El0Sync_undef, R3
- MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
- B ·kernelExitToEl1(SB)
+ EXCEPTION_WITH_ERROR(1, El0Sync_undef)
 
el0_dbg:
 B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 549f3d228..9742308d8 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -24,7 +24,10 @@ go_binary(
 "defs_impl_arm64.go",
 "main.go",
 ],
- visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
+ visibility = [
+ "//pkg/sentry/platform/kvm:__pkg__",
+ "//pkg/sentry/platform/ring0:__pkg__",
+ ],
 deps = [
 "//pkg/cpuid",
 "//pkg/sentry/arch",
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 021693791..264be23d3 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -19,8 +19,8 @@ package ring0
 // N.B. that constraints on KernelOpts must be satisfied.
 //
 //go:nosplit
-func (k *Kernel) Init(opts KernelOpts) {
- k.init(opts)
+func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
+ k.init(opts, maxCPUs)
 }
 
 // Halt halts execution.
@@ -49,6 +49,11 @@ func (defaultHooks) KernelException(Vector) {
 // kernelSyscall is a trampoline.
 //
+// On amd64, it is called with %rip on the upper half, so it can NOT
+// access any global data that is not mapped on the upper half, and it
+// must call function pointers or interfaces to switch to the lower
+// half so that the callee can access global data.
+//
 // +checkescape:hard,stack
 //
 //go:nosplit
@@ -58,6 +63,11 @@ func kernelSyscall(c *CPU) {
 
 // kernelException is a trampoline.
//
+// On amd64, it is called with %rip on the upper half, so it can NOT
+// access any global data that is not mapped on the upper half, and it
+// must call function pointers or interfaces to switch to the lower
+// half so that the callee can access global data.
+//
 // +checkescape:hard,stack
 //
 //go:nosplit
@@ -68,10 +78,10 @@ func kernelException(c *CPU, vector Vector) {
 
 // Init initializes a new CPU.
 //
 // Init allows embedding in other objects.
-func (c *CPU) Init(k *Kernel, hooks Hooks) {
- c.self = c // Set self reference.
- c.kernel = k // Set kernel reference.
- c.init() // Perform architectural init.
+func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) {
+ c.self = c // Set self reference.
+ c.kernel = k // Set kernel reference.
+ c.init(cpuID) // Perform architectural init.
 
 // Require hooks.
 if hooks != nil {
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index d37981dbf..3a9dff4cc 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -18,13 +18,42 @@ package ring0
 
 import (
 "encoding/binary"
+ "reflect"
+
+ "gvisor.dev/gvisor/pkg/usermem"
 )
 
 // init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
 // Save the root page tables.
 k.PageTables = opts.PageTables
 
+ entrySize := reflect.TypeOf(kernelEntry{}).Size()
+ var (
+ entries []kernelEntry
+ padding = 1
+ )
+ for {
+ entries = make([]kernelEntry, maxCPUs+padding-1)
+ totalSize := entrySize * uintptr(maxCPUs+padding-1)
+ addr := reflect.ValueOf(&entries[0]).Pointer()
+ if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize {
+ // The runtime forces power-of-2 alignment for allocations, and we are therefore
+ // safe once the first address is aligned and the chunk is at least a full page.
+ break
+ }
+ padding = padding << 1
+ }
+ k.cpuEntries = entries
+
+ k.globalIDT = &idt64{}
+ if reflect.TypeOf(idt64{}).Size() != usermem.PageSize {
+ panic("Size of globalIDT should be PageSize")
+ }
+ if reflect.ValueOf(k.globalIDT).Pointer()&(usermem.PageSize-1) != 0 {
+ panic("Allocated globalIDT should be page aligned")
+ }
+
 // Setup the IDT, which is uniform.
 for v, handler := range handlers {
 // Allow Breakpoint and Overflow to be called from all
@@ -39,8 +68,26 @@ func (k *Kernel) init(opts KernelOpts) {
 }
 }
 
+func (k *Kernel) EntryRegions() map[uintptr]uintptr {
+ regions := make(map[uintptr]uintptr)
+
+ addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
+ size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
+ end, _ := usermem.Addr(addr + size).RoundUp()
+ regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+ addr = reflect.ValueOf(k.globalIDT).Pointer()
+ size = reflect.TypeOf(idt64{}).Size()
+ end, _ = usermem.Addr(addr + size).RoundUp()
+ regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+ return regions
+}
+
 // init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
+ c.kernelEntry = &c.kernel.cpuEntries[cpuID]
+ c.cpuSelf = c
 // Null segment.
 c.gdt[0].setNull()
 
@@ -65,6 +112,7 @@ func (c *CPU) init() {
 // Set the kernel stack pointer in the TSS (virtual address).
stackAddr := c.StackTop() + c.stackTop = stackAddr c.tss.rsp0Lo = uint32(stackAddr) c.tss.rsp0Hi = uint32(stackAddr >> 32) c.tss.ist1Lo = uint32(stackAddr) @@ -183,7 +231,7 @@ func IsCanonical(addr uint64) bool { //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) - kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID) + c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) // Sanitize registers. regs := switchOpts.Registers @@ -197,15 +245,11 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. - jumpToKernel() // Switch to upper half. - writeCR3(uintptr(userCR3)) // Change to user address space. if switchOpts.FullRestore { - vector = iret(c, regs) + vector = iret(c, regs, uintptr(userCR3)) } else { - vector = sysret(c, regs) + vector = sysret(c, regs, uintptr(userCR3)) } - writeCR3(uintptr(kernelCR3)) // Return to kernel address space. - jumpToUser() // Return to lower half. SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. return @@ -219,7 +263,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { //go:nosplit func start(c *CPU) { // Save per-cpu & FS segment. - WriteGS(kernelAddr(c)) + WriteGS(kernelAddr(c.kernelEntry)) WriteFS(uintptr(c.registers.Fs_base)) // Initialize floating point. diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index d0afa1aaa..b294ccc7c 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -25,13 +25,13 @@ func HaltAndResume() func HaltEl1SvcAndResume() // init initializes architecture-specific state. -func (k *Kernel) init(opts KernelOpts) { +func (k *Kernel) init(opts KernelOpts, maxCPUs int) { // Save the root page tables. k.PageTables = opts.PageTables } // init initializes architecture-specific state. -func (c *CPU) init() { +func (c *CPU) init(cpuID int) { // Set the kernel stack pointer(virtual address). c.registers.Sp = uint64(c.StackTop()) @@ -53,17 +53,20 @@ func IsCanonical(addr uint64) bool { //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { + storeAppASID(uintptr(switchOpts.UserASID)) + if switchOpts.Flush { + FlushTlbAll() + } + regs := switchOpts.Registers regs.Pstate &= ^uint64(PsrFlagsClear) regs.Pstate |= UserFlagsSet LoadFloatingPoint(switchOpts.FloatingPointState) - SetTLS(regs.TPIDR_EL0) kernelExitToEl0() - regs.TPIDR_EL0 = GetTLS() SaveFloatingPoint(switchOpts.FloatingPointState) vector = c.vecCode diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index ca968a036..0ec5c3bc5 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -61,21 +61,9 @@ func wrgsbase(addr uintptr) // wrgsmsr writes to the GS_BASE MSR. func wrgsmsr(addr uintptr) -// writeCR3 writes the CR3 value. -func writeCR3(phys uintptr) - -// readCR3 reads the current CR3 value. -func readCR3() uintptr - // readCR2 reads the current CR2 value. func readCR2() uintptr -// jumpToKernel jumps to the kernel version of the current RIP. 
-func jumpToKernel() - -// jumpToUser jumps to the user version of the current RIP. -func jumpToUser() - // fninit initializes the floating point unit. func fninit() diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 75d742750..2fe83568a 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -127,53 +127,6 @@ TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 BYTE $0x0f; BYTE $0x30; // WRMSR RET -// jumpToUser changes execution to the user address. -// -// This works by changing the return value to the user version. -TEXT ·jumpToUser(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - MOVQ ·KernelStartAddress(SB), BX - NOTQ BX - ANDQ BX, SP // Switch the stack. - ANDQ BX, BP // Switch the frame pointer. - ANDQ BX, AX // Future return value. - MOVQ AX, 0(SP) - RET - -// jumpToKernel changes execution to the kernel address space. -// -// This works by changing the return value to the kernel version. -TEXT ·jumpToKernel(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - MOVQ ·KernelStartAddress(SB), BX - ORQ BX, SP // Switch the stack. - ORQ BX, BP // Switch the frame pointer. - ORQ BX, AX // Future return value. - MOVQ AX, 0(SP) - RET - -// writeCR3 writes the given CR3 value. -// -// The code corresponds to: -// -// mov %rax, %cr3 -// -TEXT ·writeCR3(SB),NOSPLIT,$0-8 - MOVQ cr3+0(FP), AX - BYTE $0x0f; BYTE $0x22; BYTE $0xd8; - RET - -// readCR3 reads the current CR3 value. -// -// The code corresponds to: -// -// mov %cr3, %rax -// -TEXT ·readCR3(SB),NOSPLIT,$0-8 - BYTE $0x0f; BYTE $0x20; BYTE $0xd8; - MOVQ AX, ret+0(FP) - RET - // readCR2 reads the current CR2 value. // // The code corresponds to: diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index 00e52c8af..d91a09de1 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -16,6 +16,15 @@ package ring0 +// storeAppASID writes the application's asid value. +func storeAppASID(asid uintptr) + +// LocalFlushTlbAll same as FlushTlbAll, but only applies to the calling CPU. +func LocalFlushTlbAll() + +// FlushTlbAll flush all tlb. +func FlushTlbAll() + // CPACREL1 returns the value of the CPACR_EL1 register. func CPACREL1() (value uintptr) @@ -44,12 +53,6 @@ func LoadFloatingPoint(*byte) // SaveFloatingPoint saves floating point state. func SaveFloatingPoint(*byte) -// GetTLS returns the value of TPIDR_EL0 register. -func GetTLS() (value uint64) - -// SetTLS writes the TPIDR_EL0 value. -func SetTLS(value uint64) - // Init sets function pointers based on architectural features. // // This must be called prior to using ring0. 
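The ASID and TLB helpers above are wired together by SwitchToUser in kernel_arm64.go (earlier in this diff): the application's ASID is recorded before entering the app, and the TLB is flushed only when the page tables changed. A minimal sketch of that calling pattern, assuming only the declarations above; switchToApp is an illustrative name, not part of the package:

    // switchToApp is a sketch only: storeAppASID, FlushTlbAll, and
    // kernelExitToEl0 are the ring0 primitives; nothing else is real API.
    func switchToApp(asid uintptr, flush bool) {
    	// Record the application's ASID so the exit path can program it
    	// into TTBR1_EL1 alongside the application's TTBR0 page tables.
    	storeAppASID(asid)
    	if flush {
    		// The tables behind this ASID changed; invalidate stale
    		// entries across the inner-shareable domain.
    		FlushTlbAll()
    	}
    	// TPIDR_EL0 is now saved and restored by the entry/exit assembly
    	// via PTRACE_TLS, which is why GetTLS/SetTLS could be removed.
    	kernelExitToEl0()
    }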
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index 86bfbe46f..da9d3cf55 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -15,14 +15,18 @@ #include "funcdata.h" #include "textflag.h" -TEXT ·GetTLS(SB),NOSPLIT,$0-8 - MRS TPIDR_EL0, R1 - MOVD R1, ret+0(FP) +TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0 + DSB $6 // dsb(nshst) + WORD $0xd508871f // __tlbi(vmalle1) + DSB $7 // dsb(nsh) + ISB $15 RET -TEXT ·SetTLS(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R1 - MSR R1, TPIDR_EL0 +TEXT ·FlushTlbAll(SB),NOSPLIT,$0 + DSB $10 // dsb(ishst) + WORD $0xd508831f // __tlbi(vmalle1is) + DSB $11 // dsb(ish) + ISB $15 RET TEXT ·CPACREL1(SB),NOSPLIT,$0-8 diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index b8ab120a0..ca4075b09 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -30,14 +30,21 @@ func Emit(w io.Writer) { c := &CPU{} fmt.Fprintf(w, "\n// CPU offsets.\n") - fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) + + e := &kernelEntry{} + fmt.Fprintf(w, "\n// CPU entry offsets.\n") + fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0) fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) fmt.Fprintf(w, "\n// Vectors.\n") diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index f3de962f0..45eba960d 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -41,6 +41,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_APP_ASID 0x%02x\n", reflect.ValueOf(&c.appASID).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) @@ -124,4 +125,5 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", 
reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer())
 fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer())
 fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_TLS 0x%02x\n", reflect.ValueOf(&p.TPIDR_EL0).Pointer()-reflect.ValueOf(p).Pointer())
 }
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index 6409d1d91..520161755 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -78,7 +78,7 @@ const (
 const (
 executeDisable = xn
- optionMask = 0xfff | 0xfff<<48
+ optionMask = 0xfff | 0xffff<<48
 protDefault = accessed | shared
 )
 
@@ -188,7 +188,7 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
 v |= mtNormal
 } else {
 v = v &^ user
- v |= mtDevicenGnRE // Strong order for the addresses with ring0.KernelStartAddress.
+ v |= mtNormal
 }
 atomic.StoreUintptr((*uintptr)(p), v)
 }
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
index 1a49f12a2..5ddd10256 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -36,7 +36,7 @@ const (
 pudSize = 1 << pudShift
 pgdSize = 1 << pgdShift
 
- ttbrASIDOffset = 55
+ ttbrASIDOffset = 48
 ttbrASIDMask = 0xff
 
 entriesPerPage = 512
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 9da0ea685..34fbc1c35 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -39,7 +39,9 @@ const (
 _RFLAGS_AC = 1 << 18
 _RFLAGS_NT = 1 << 14
- _RFLAGS_IOPL = 3 << 12
+ _RFLAGS_IOPL0 = 1 << 12
+ _RFLAGS_IOPL1 = 1 << 13
+ _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1
 _RFLAGS_DF = 1 << 10
 _RFLAGS_IF = 1 << 9
 _RFLAGS_STEP = 1 << 8
@@ -67,15 +69,45 @@ const (
 KernelFlagsSet = _RFLAGS_RESERVED
 
 // UserFlagsSet are always set in userspace.
- UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF
+ //
+ // _RFLAGS_IOPL is a two-bit field that holds the I/O privilege
+ // level. The Current Privilege Level (CPL) of the task must be less
+ // than or equal to the IOPL in order for the task or program to access
+ // I/O ports.
+ //
+ // Here, _RFLAGS_IOPL0 is used only to determine whether the task is
+ // running in kernel or user mode. In user mode, the CPL is
+ // always 3, and it doesn't matter what the IOPL is set to if it is below the CPL.
+ //
+ // We need one bit that is always different in user and
+ // kernel modes. And we have to remember that even though we have
+ // KernelFlagsClear, we can still see some of these flags in kernel
+ // mode. This can happen when the Go runtime switches to a goroutine
+ // that was saved in host mode. On restore, the popf
+ // instruction is used to restore flags, which means that all flags
+ // that the goroutine had in host mode will be restored in
+ // kernel mode.
+ //
+ // _RFLAGS_IOPL0 is never set in host and kernel modes, and we always set
+ // it in user mode. So if this flag is set, the task is running in
+ // user mode, and if it isn't set, the task is running in kernel
+ // mode.
+ UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0
 
 // KernelFlagsClear should always be clear in the kernel.
KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
 
 // UserFlagsClear are always cleared in userspace.
- UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL
+ UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1
 )
 
+// IsKernelFlags returns true if rflags corresponds to kernel mode.
+//
+//go:nosplit
+func IsKernelFlags(rflags uint64) bool {
+ return rflags&_RFLAGS_IOPL0 == 0
+}
+
 // Vector is an exception vector.
 type Vector uintptr
 
@@ -104,7 +136,7 @@ const (
 VirtualizationException
 SecurityException = 0x1e
 SyscallInt80 = 0x80
- _NR_INTERRUPTS = SyscallInt80 + 1
+ _NR_INTERRUPTS = 0x100
 )
 
 // System call vectors.
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index c0fd3425b..a3f775d15 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -10,6 +10,7 @@ go_library(
 "//pkg/abi/linux",
 "//pkg/binary",
 "//pkg/context",
+ "//pkg/marshal",
 "//pkg/sentry/device",
 "//pkg/sentry/fs",
 "//pkg/sentry/fs/fsutil",
@@ -20,6 +21,5 @@ go_library(
 "//pkg/syserr",
 "//pkg/tcpip",
 "//pkg/usermem",
- "//tools/go_marshal/marshal",
 ],
 )
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e76e498de..b6ebe29d6 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -21,6 +21,8 @@ go_library(
 "//pkg/context",
 "//pkg/fdnotifier",
 "//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
 "//pkg/safemem",
 "//pkg/sentry/arch",
 "//pkg/sentry/device",
@@ -37,11 +39,12 @@ go_library(
 "//pkg/sentry/vfs",
 "//pkg/syserr",
 "//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/network/ipv6",
 "//pkg/tcpip/stack",
 "//pkg/usermem",
 "//pkg/waiter",
- "//tools/go_marshal/marshal",
- "//tools/go_marshal/primitive",
 "@org_golang_x_sys//unix:go_default_library",
 ],
 )
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 242e6bf76..7d3c4a01c 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -24,6 +24,8 @@ import (
 "gvisor.dev/gvisor/pkg/context"
 "gvisor.dev/gvisor/pkg/fdnotifier"
 "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
 "gvisor.dev/gvisor/pkg/safemem"
 "gvisor.dev/gvisor/pkg/sentry/arch"
 "gvisor.dev/gvisor/pkg/sentry/fs"
@@ -36,8 +38,6 @@ import (
 "gvisor.dev/gvisor/pkg/syserror"
 "gvisor.dev/gvisor/pkg/usermem"
 "gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 const (
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 8a1d52ebf..163af329b 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -52,6 +52,7 @@ var _ = socket.SocketVFS2(&socketVFS2{})
 func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
 mnt := t.Kernel().SocketMount()
 d := sockfs.NewDentry(t.Credentials(), mnt)
+ defer d.DecRef(t)
 
 s := &socketVFS2{
 socketOpsCommon: socketOpsCommon{
@@ -77,6 +78,13 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
 return vfsfd, nil
 }
 
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *socketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
+}
+
 // Readiness implements waiter.Waitable.Readiness.
func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) @@ -97,11 +105,6 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal return ioctl(ctx, s.fd, uio, args) } -// Allocate implements vfs.FileDescriptionImpl.Allocate. -func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.ENODEV -} - // PRead implements vfs.FileDescriptionImpl.PRead. func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.ESPIPE diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index fda3dcb35..faa61160e 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -30,6 +30,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -59,6 +62,8 @@ type Stack struct { tcpSACKEnabled bool netDevFile *os.File netSNMPFile *os.File + ipv4Forwarding bool + ipv6Forwarding bool } // NewStack returns an empty Stack containing no configuration. @@ -118,6 +123,13 @@ func (s *Stack) Configure() error { s.netSNMPFile = f } + s.ipv6Forwarding = false + if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil { + s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0" + } else { + log.Warningf("Failed to read if ipv6 forwarding is enabled, setting to false") + } + return nil } @@ -468,3 +480,21 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} + +// Forwarding implements inet.Stack.Forwarding. +func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { + switch protocol { + case ipv4.ProtocolNumber: + return s.ipv4Forwarding + case ipv6.ProtocolNumber: + return s.ipv6Forwarding + default: + log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol) + return false + } +} + +// SetForwarding implements inet.Stack.SetForwarding. +func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { + return syserror.EACCES +} diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 721094bbf..8aea0200f 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -6,6 +6,8 @@ go_library( name = "netfilter", srcs = [ "extensions.go", + "ipv4.go", + "ipv6.go", "netfilter.go", "owner_matcher.go", "targets.go", diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index 0336a32d8..549787955 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,7 +39,7 @@ type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. name() string - // marshal converts from an stack.Matcher to an ABI struct. 
+ // marshal converts from a stack.Matcher to an ABI struct.
 marshal(matcher stack.Matcher) []byte
 
 // unmarshal converts from the ABI matcher struct to an
@@ -93,3 +95,71 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf
 }
 return matchMaker.unmarshal(buf, filter)
 }
+
+// targetMaker knows how to (un)marshal a target. Once registered,
+// marshalTarget and unmarshalTarget can be used.
+type targetMaker interface {
+ // id uniquely identifies the target.
+ id() stack.TargetID
+
+ // marshal converts from a stack.Target to an ABI struct.
+ marshal(target stack.Target) []byte
+
+ // unmarshal converts from the ABI target struct to a stack.Target.
+ unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error)
+}
+
+// targetMakers maps the TargetID of supported targets to the targetMaker that
+// marshals and unmarshals it. It is immutable after package initialization.
+var targetMakers = map[stack.TargetID]targetMaker{}
+
+func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) {
+ tid := stack.TargetID{
+ Name: name,
+ NetworkProtocol: netProto,
+ Revision: rev,
+ }
+ if _, ok := targetMakers[tid]; !ok {
+ return 0, false
+ }
+
+ // Return the highest supported revision unless rev is higher.
+ for _, other := range targetMakers {
+ otherID := other.id()
+ if name == otherID.Name && netProto == otherID.NetworkProtocol && otherID.Revision > rev {
+ rev = uint8(otherID.Revision)
+ }
+ }
+ return rev, true
+}
+
+// registerTargetMaker should be called by target extensions to register them
+// with the netfilter package.
+func registerTargetMaker(tm targetMaker) {
+ if _, ok := targetMakers[tm.id()]; ok {
+ panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id()))
+ }
+ targetMakers[tm.id()] = tm
+}
+
+func marshalTarget(target stack.Target) []byte {
+ targetMaker, ok := targetMakers[target.ID()]
+ if !ok {
+ panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.ID()))
+ }
+ return targetMaker.marshal(target)
+}
+
+func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (stack.Target, *syserr.Error) {
+ tid := stack.TargetID{
+ Name: target.Name.String(),
+ NetworkProtocol: filter.NetworkProtocol(),
+ Revision: target.Revision,
+ }
+ targetMaker, ok := targetMakers[tid]
+ if !ok {
+ nflog("unsupported target with name %q", target.Name.String())
+ return nil, syserr.ErrInvalidArgument
+ }
+ return targetMaker.unmarshal(buf, filter)
+}
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
new file mode 100644
index 000000000..b560fae0d
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -0,0 +1,265 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package netfilter + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +// emptyIPv4Filter is for comparison with a rule's filters to determine whether +// it is also empty. It is immutable. +var emptyIPv4Filter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00", +} + +// convertNetstackToBinary4 converts the iptables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a little data, reading some +// offsets, jumping to those offsets, parsing again, etc. +func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) { + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(tablename) { + return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) + } + + table, ok := stk.IPTables().GetTable(tablename.String(), false) + if !ok { + return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) + } + + // Setup the info struct. + entries, info := getEntries4(table, tablename) + return entries, info, nil +} + +func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) { + var info linux.IPTGetinfo + var entries linux.KernelIPTGetEntries + copy(info.Name[:], tablename[:]) + copy(entries.Name[:], info.Name[:]) + info.ValidHooks = table.ValidHooks() + + for ruleIdx, rule := range table.Rules { + nflog("convert to binary: current offset: %d", entries.Size) + + setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) + // Each rule corresponds to an entry. + entry := linux.KernelIPTEntry{ + Entry: linux.IPTEntry{ + IP: linux.IPTIP{ + Protocol: uint16(rule.Filter.Protocol), + }, + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + copy(entry.Entry.IP.Dst[:], rule.Filter.Dst) + copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.Entry.IP.Src[:], rule.Filter.Src) + copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask) + copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + if rule.Filter.DstInvert { + entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP + } + if rule.Filter.SrcInvert { + entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP + } + if rule.Filter.OutputInterfaceInvert { + entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT + } + + for _, matcher := range rule.Matchers { + // Serialize the matcher and add it to the + // entry. + serialized := marshalMatcher(matcher) + nflog("convert to binary: matcher serialized as: %v", serialized) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.Entry.NextOffset += uint16(len(serialized)) + entry.Entry.TargetOffset += uint16(len(serialized)) + } + + // Serialize and append the target. 
+ serialized := marshalTarget(rule.Target)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+
+ nflog("convert to binary: adding entry: %+v", entry)
+
+ entries.Size += uint32(entry.Entry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, entry)
+ info.NumEntries++
+ }
+
+ info.Size = entries.Size
+ nflog("convert to binary: finished with a marshalled size of %d", info.Size)
+ return entries, info
+}
+
+func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+ nflog("set entries: setting entries in table %q", replace.Name.String())
+
+ // Convert input into a list of rules and their offsets.
+ var offset uint32
+ // offsets maps rule byte offsets to their position in table.Rules.
+ offsets := map[uint32]int{}
+ for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+ nflog("set entries: processing entry at offset %d", offset)
+
+ // Get the struct ipt_entry.
+ if len(optVal) < linux.SizeOfIPTEntry {
+ nflog("optVal has insufficient size for entry %d", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var entry linux.IPTEntry
+ buf := optVal[:linux.SizeOfIPTEntry]
+ binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+ initialOptValLen := len(optVal)
+ optVal = optVal[linux.SizeOfIPTEntry:]
+
+ if entry.TargetOffset < linux.SizeOfIPTEntry {
+ nflog("entry has too-small target offset %d", entry.TargetOffset)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): We should support more IPTIP
+ // filtering fields.
+ filter, err := filterFromIPTIP(entry.IP)
+ if err != nil {
+ nflog("bad iptip: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Matchers and targets can specify
+ // that they only work for certain protocols, hooks, tables.
+ // Get matchers.
+ matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+ if len(optVal) < int(matchersSize) {
+ nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ matchers, err := parseMatchers(filter, optVal[:matchersSize])
+ if err != nil {
+ nflog("failed to parse matchers: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+ optVal = optVal[matchersSize:]
+
+ // Get the target of the rule.
+ targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + + rule := stack.Rule{ + Filter: filter, + Matchers: matchers, + } + + { + target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */) + if err != nil { + nflog("failed to parse target: %v", err) + return nil, err + } + rule.Target = target + } + optVal = optVal[targetSize:] + + table.Rules = append(table.Rules, rule) + offsets[offset] = int(entryIdx) + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return nil, syserr.ErrInvalidArgument + } + } + return offsets, nil +} + +func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { + if containsUnsupportedFields4(iptip) { + return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + } + if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) + } + if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) + } + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + + return stack.IPHeaderFilter{ + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + // A Protocol value of 0 indicates all protocols match. + CheckProtocol: iptip.Protocol != 0, + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, + }, nil +} + +func containsUnsupportedFields4(iptip linux.IPTIP) bool { + // The following features are supported: + // - Protocol + // - Dst and DstMask + // - Src and SrcMask + // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. + var emptyInterface = [linux.IFNAMSIZ]byte{} + // Disable any supported inverse flags. + inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || + iptip.InputInterfaceMask != emptyInterface || + iptip.Flags != 0 || + iptip.InverseFlags&^inverseMask != 0 +} diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go new file mode 100644 index 000000000..4253f7bf4 --- /dev/null +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -0,0 +1,270 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +// emptyIPv6Filter is for comparison with a rule's filters to determine whether +// it is also empty. It is immutable. +var emptyIPv6Filter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", +} + +// convertNetstackToBinary6 converts the ip6tables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a little data, reading some +// offsets, jumping to those offsets, parsing again, etc. +func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) { + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(tablename) { + return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) + } + + table, ok := stk.IPTables().GetTable(tablename.String(), true) + if !ok { + return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) + } + + // Setup the info struct, which is the same in IPv4 and IPv6. + entries, info := getEntries6(table, tablename) + return entries, info, nil +} + +func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) { + var info linux.IPTGetinfo + var entries linux.KernelIP6TGetEntries + copy(info.Name[:], tablename[:]) + copy(entries.Name[:], info.Name[:]) + info.ValidHooks = table.ValidHooks() + + for ruleIdx, rule := range table.Rules { + nflog("convert to binary: current offset: %d", entries.Size) + + setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) + // Each rule corresponds to an entry. 
+ entry := linux.KernelIP6TEntry{
+ Entry: linux.IP6TEntry{
+ IPv6: linux.IP6TIP{
+ Protocol: uint16(rule.Filter.Protocol),
+ },
+ NextOffset: linux.SizeOfIP6TEntry,
+ TargetOffset: linux.SizeOfIP6TEntry,
+ },
+ }
+ copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst)
+ copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask)
+ copy(entry.Entry.IPv6.Src[:], rule.Filter.Src)
+ copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask)
+ copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface)
+ copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+ if rule.Filter.DstInvert {
+ entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP
+ }
+ if rule.Filter.SrcInvert {
+ entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP
+ }
+ if rule.Filter.OutputInterfaceInvert {
+ entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT
+ }
+ if rule.Filter.CheckProtocol {
+ entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO
+ }
+
+ for _, matcher := range rule.Matchers {
+ // Serialize the matcher and add it to the
+ // entry.
+ serialized := marshalMatcher(matcher)
+ nflog("convert to binary: matcher serialized as: %v", serialized)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+ entry.Entry.TargetOffset += uint16(len(serialized))
+ }
+
+ // Serialize and append the target.
+ serialized := marshalTarget(rule.Target)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+
+ nflog("convert to binary: adding entry: %+v", entry)
+
+ entries.Size += uint32(entry.Entry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, entry)
+ info.NumEntries++
+ }
+
+ info.Size = entries.Size
+ nflog("convert to binary: finished with a marshalled size of %d", info.Size)
+ return entries, info
+}
+
+func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+ nflog("set entries: setting entries in table %q", replace.Name.String())
+
+ // Convert input into a list of rules and their offsets.
+ var offset uint32
+ // offsets maps rule byte offsets to their position in table.Rules.
+ offsets := map[uint32]int{}
+ for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+ nflog("set entries: processing entry at offset %d", offset)
+
+ // Get the struct ipt_entry.
+ if len(optVal) < linux.SizeOfIP6TEntry {
+ nflog("optVal has insufficient size for entry %d", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var entry linux.IP6TEntry
+ buf := optVal[:linux.SizeOfIP6TEntry]
+ binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+ initialOptValLen := len(optVal)
+ optVal = optVal[linux.SizeOfIP6TEntry:]
+
+ if entry.TargetOffset < linux.SizeOfIP6TEntry {
+ nflog("entry has too-small target offset %d", entry.TargetOffset)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): We should support more IPTIP
+ // filtering fields.
+ filter, err := filterFromIP6TIP(entry.IPv6)
+ if err != nil {
+ nflog("bad iptip: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Matchers and targets can specify
+ // that they only work for certain protocols, hooks, tables.
+ // Get matchers.
+ matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry + if len(optVal) < int(matchersSize) { + nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + matchers, err := parseMatchers(filter, optVal[:matchersSize]) + if err != nil { + nflog("failed to parse matchers: %v", err) + return nil, syserr.ErrInvalidArgument + } + optVal = optVal[matchersSize:] + + // Get the target of the rule. + targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + + rule := stack.Rule{ + Filter: filter, + Matchers: matchers, + } + + { + target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */) + if err != nil { + nflog("failed to parse target: %v", err) + return nil, err + } + rule.Target = target + } + optVal = optVal[targetSize:] + + table.Rules = append(table.Rules, rule) + offsets[offset] = int(entryIdx) + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return nil, syserr.ErrInvalidArgument + } + } + return offsets, nil +} + +func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { + if containsUnsupportedFields6(iptip) { + return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + } + if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) + } + if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) + } + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + + return stack.IPHeaderFilter{ + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + // In ip6tables a flag controls whether to check the protocol. + CheckProtocol: iptip.Flags&linux.IP6T_F_PROTO != 0, + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0, + }, nil +} + +func containsUnsupportedFields6(iptip linux.IP6TIP) bool { + // The following features are supported: + // - Protocol + // - Dst and DstMask + // - Src and SrcMask + // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. + var emptyInterface = [linux.IFNAMSIZ]byte{} + flagMask := uint8(linux.IP6T_F_PROTO) + // Disable any supported inverse flags. 
+ inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || + iptip.InputInterfaceMask != emptyInterface || + iptip.Flags&^flagMask != 0 || + iptip.InverseFlags&^inverseMask != 0 || + iptip.TOS != 0 +} diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index e91b0624c..904a12e38 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,7 +17,6 @@ package netfilter import ( - "bytes" "errors" "fmt" @@ -27,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,15 +35,6 @@ import ( // developing iptables, but can pollute sentry logs otherwise. const enableLogging = false -// emptyFilter is for comparison with a rule's filters to determine whether it -// is also empty. It is immutable. -var emptyFilter = stack.IPHeaderFilter{ - Dst: "\x00\x00\x00\x00", - DstMask: "\x00\x00\x00\x00", - Src: "\x00\x00\x00\x00", - SrcMask: "\x00\x00\x00\x00", -} - // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { if enableLogging && log.IsLogging(log.Debug) { @@ -54,14 +43,19 @@ func nflog(format string, args ...interface{}) { } // GetInfo returns information about iptables. -func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { +func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. var info linux.IPTGetinfo if _, err := info.CopyIn(t, outPtr); err != nil { return linux.IPTGetinfo{}, syserr.FromError(err) } - _, info, err := convertNetstackToBinary(stack, info.Name) + var err error + if ipv6 { + _, info, err = convertNetstackToBinary6(stack, info.Name) + } else { + _, info, err = convertNetstackToBinary4(stack, info.Name) + } if err != nil { nflog("couldn't convert iptables: %v", err) return linux.IPTGetinfo{}, syserr.ErrInvalidArgument @@ -71,8 +65,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT return info, nil } -// GetEntries returns netstack's iptables rules encoded for the iptables tool. -func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { +// GetEntries4 returns netstack's iptables rules. +func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { // Read in the struct and table name. var userEntries linux.IPTGetEntries if _, err := userEntries.CopyIn(t, outPtr); err != nil { @@ -82,7 +76,7 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen // Convert netstack's iptables rules to something that the iptables // tool can understand. 
- entries, _, err := convertNetstackToBinary(stack, userEntries.Name) + entries, _, err := convertNetstackToBinary4(stack, userEntries.Name) if err != nil { nflog("couldn't read entries: %v", err) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument @@ -95,112 +89,53 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen return entries, nil } -// convertNetstackToBinary converts the iptables as stored in netstack to the -// format expected by the iptables tool. Linux stores each table as a binary -// blob that can only be traversed by parsing a bit, reading some offsets, -// jumping to those offsets, parsing again, etc. -func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) { - table, ok := stack.IPTables().GetTable(tablename.String()) - if !ok { - return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) +// GetEntries6 returns netstack's ip6tables rules. +func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) { + // Read in the struct and table name. IPv4 and IPv6 utilize structs + // with the same layout. + var userEntries linux.IPTGetEntries + if _, err := userEntries.CopyIn(t, outPtr); err != nil { + nflog("couldn't copy in entries %q", userEntries.Name) + return linux.KernelIP6TGetEntries{}, syserr.FromError(err) } - var entries linux.KernelIPTGetEntries - var info linux.IPTGetinfo - info.ValidHooks = table.ValidHooks() - - // The table name has to fit in the struct. - if linux.XT_TABLE_MAXNAMELEN < len(tablename) { - return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) + // Convert netstack's iptables rules to something that the iptables + // tool can understand. + entries, _, err := convertNetstackToBinary6(stack, userEntries.Name) + if err != nil { + nflog("couldn't read entries: %v", err) + return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument + } + if binary.Size(entries) > uintptr(outLen) { + nflog("insufficient GetEntries output size: %d", uintptr(outLen)) + return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } - copy(info.Name[:], tablename[:]) - copy(entries.Name[:], tablename[:]) - - for ruleIdx, rule := range table.Rules { - nflog("convert to binary: current offset: %d", entries.Size) - - // Is this a chain entry point? - for hook, hookRuleIdx := range table.BuiltinChains { - if hookRuleIdx == ruleIdx { - nflog("convert to binary: found hook %d at offset %d", hook, entries.Size) - info.HookEntry[hook] = entries.Size - } - } - // Is this a chain underflow point? - for underflow, underflowRuleIdx := range table.Underflows { - if underflowRuleIdx == ruleIdx { - nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size) - info.Underflow[underflow] = entries.Size - } - } - // Each rule corresponds to an entry. 
- entry := linux.KernelIPTEntry{ - Entry: linux.IPTEntry{ - IP: linux.IPTIP{ - Protocol: uint16(rule.Filter.Protocol), - }, - NextOffset: linux.SizeOfIPTEntry, - TargetOffset: linux.SizeOfIPTEntry, - }, - } - copy(entry.Entry.IP.Dst[:], rule.Filter.Dst) - copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask) - copy(entry.Entry.IP.Src[:], rule.Filter.Src) - copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask) - copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) - copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) - if rule.Filter.DstInvert { - entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP - } - if rule.Filter.SrcInvert { - entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP - } - if rule.Filter.OutputInterfaceInvert { - entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT - } + return entries, nil +} - for _, matcher := range rule.Matchers { - // Serialize the matcher and add it to the - // entry. - serialized := marshalMatcher(matcher) - nflog("convert to binary: matcher serialized as: %v", serialized) - if len(serialized)%8 != 0 { - panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) - } - entry.Elems = append(entry.Elems, serialized...) - entry.Entry.NextOffset += uint16(len(serialized)) - entry.Entry.TargetOffset += uint16(len(serialized)) +// setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint +// or underflow, in which case it fills in info.HookEntry and info.Underflows. +func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) { + // Is this a chain entry point? + for hook, hookRuleIdx := range table.BuiltinChains { + if hookRuleIdx == ruleIdx { + nflog("convert to binary: found hook %d at offset %d", hook, offset) + info.HookEntry[hook] = offset } - - // Serialize and append the target. - serialized := marshalTarget(rule.Target) - if len(serialized)%8 != 0 { - panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) + } + // Is this a chain underflow point? + for underflow, underflowRuleIdx := range table.Underflows { + if underflowRuleIdx == ruleIdx { + nflog("convert to binary: found underflow %d at offset %d", underflow, offset) + info.Underflow[underflow] = offset } - entry.Elems = append(entry.Elems, serialized...) - entry.Entry.NextOffset += uint16(len(serialized)) - - nflog("convert to binary: adding entry: %+v", entry) - - entries.Size += uint32(entry.Entry.NextOffset) - entries.Entrytable = append(entries.Entrytable, entry) - info.NumEntries++ } - - nflog("convert to binary: finished with an marshalled size of %d", info.Size) - info.Size = entries.Size - return entries, info, nil } // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. -func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { - // Get the basic rules data (struct ipt_replace). - if len(optVal) < linux.SizeOfIPTReplace { - nflog("optVal has insufficient size for replace %d", len(optVal)) - return syserr.ErrInvalidArgument - } +func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { var replace linux.IPTReplace replaceBuf := optVal[:linux.SizeOfIPTReplace] optVal = optVal[linux.SizeOfIPTReplace:] @@ -218,79 +153,15 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { return syserr.ErrInvalidArgument } - nflog("set entries: setting entries in table %q", replace.Name.String()) - - // Convert input into a list of rules and their offsets. 
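Note: in the binary table format that the removed loop produced (and that its per-family replacements still produce), each serialized entry is the fixed ipt_entry header, then the matchers, then the target, so TargetOffset and NextOffset are running sums. A worked example with hypothetical, 64-bit-aligned sizes (the alignment the panics above enforce):

package main

import "fmt"

func main() {
	const sizeOfIPTEntry = 112   // stand-in for linux.SizeOfIPTEntry
	matcherSizes := []uint16{48} // one serialized matcher
	const targetSize = 40        // serialized target

	targetOffset := uint16(sizeOfIPTEntry)
	for _, m := range matcherSizes {
		targetOffset += m // matchers sit between the header and the target
	}
	nextOffset := targetOffset + targetSize

	fmt.Println(targetOffset, nextOffset) // 160 200
}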
- var offset uint32 - // offsets maps rule byte offsets to their position in table.Rules. - offsets := map[uint32]int{} - for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { - nflog("set entries: processing entry at offset %d", offset) - - // Get the struct ipt_entry. - if len(optVal) < linux.SizeOfIPTEntry { - nflog("optVal has insufficient size for entry %d", len(optVal)) - return syserr.ErrInvalidArgument - } - var entry linux.IPTEntry - buf := optVal[:linux.SizeOfIPTEntry] - binary.Unmarshal(buf, usermem.ByteOrder, &entry) - initialOptValLen := len(optVal) - optVal = optVal[linux.SizeOfIPTEntry:] - - if entry.TargetOffset < linux.SizeOfIPTEntry { - nflog("entry has too-small target offset %d", entry.TargetOffset) - return syserr.ErrInvalidArgument - } - - // TODO(gvisor.dev/issue/170): We should support more IPTIP - // filtering fields. - filter, err := filterFromIPTIP(entry.IP) - if err != nil { - nflog("bad iptip: %v", err) - return syserr.ErrInvalidArgument - } - - // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certain protocols, hooks, tables. - // Get matchers. - matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry - if len(optVal) < int(matchersSize) { - nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) - return syserr.ErrInvalidArgument - } - matchers, err := parseMatchers(filter, optVal[:matchersSize]) - if err != nil { - nflog("failed to parse matchers: %v", err) - return syserr.ErrInvalidArgument - } - optVal = optVal[matchersSize:] - - // Get the target of the rule. - targetSize := entry.NextOffset - entry.TargetOffset - if len(optVal) < int(targetSize) { - nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) - return syserr.ErrInvalidArgument - } - target, err := parseTarget(filter, optVal[:targetSize]) - if err != nil { - nflog("failed to parse target: %v", err) - return syserr.ErrInvalidArgument - } - optVal = optVal[targetSize:] - - table.Rules = append(table.Rules, stack.Rule{ - Filter: filter, - Target: target, - Matchers: matchers, - }) - offsets[offset] = int(entryIdx) - offset += uint32(entry.NextOffset) - - if initialOptValLen-len(optVal) != int(entry.NextOffset) { - nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) - return syserr.ErrInvalidArgument - } + var err *syserr.Error + var offsets map[uint32]int + if ipv6 { + offsets, err = modifyEntries6(stk, optVal, &replace, &table) + } else { + offsets, err = modifyEntries4(stk, optVal, &replace, &table) + } + if err != nil { + return err } // Go through the list of supported hooks for this table and, for each @@ -305,7 +176,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { table.BuiltinChains[hk] = ruleIdx } if offset == replace.Underflow[hook] { - if !validUnderflow(table.Rules[ruleIdx]) { + if !validUnderflow(table.Rules[ruleIdx], ipv6) { nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx) return syserr.ErrInvalidArgument } @@ -323,9 +194,9 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { } } - // Add the user chains. + // Check the user chains. for ruleIdx, rule := range table.Rules { - if _, ok := rule.Target.(stack.UserChainTarget); !ok { + if _, ok := rule.Target.(*stack.UserChainTarget); !ok { continue } @@ -346,7 +217,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // Set each jump to point to the appropriate rule. 
Right now they hold byte // offsets. for ruleIdx, rule := range table.Rules { - jump, ok := rule.Target.(JumpTarget) + jump, ok := rule.Target.(*JumpTarget) if !ok { continue } @@ -370,7 +241,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { if ruleIdx == stack.HookUnset { continue } - if !isUnconditionalAccept(table.Rules[ruleIdx]) { + if !isUnconditionalAccept(table.Rules[ruleIdx], ipv6) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } @@ -382,7 +253,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. - return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table)) + return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table, ipv6)) + } // parseMatchers parses 0 or more matchers from optVal. optVal should contain @@ -404,7 +276,6 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { - return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) } if len(optVal) < int(match.MatchSize) { @@ -429,79 +300,26 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, return matchers, nil } -func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { - if containsUnsupportedFields(iptip) { - return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) - } - if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { - return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) - } - if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { - return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) - } - - n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) - if n == -1 { - n = len(iptip.OutputInterface) - } - ifname := string(iptip.OutputInterface[:n]) - - n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) - if n == -1 { - n = len(iptip.OutputInterfaceMask) - } - ifnameMask := string(iptip.OutputInterfaceMask[:n]) - - return stack.IPHeaderFilter{ - Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), - Dst: tcpip.Address(iptip.Dst[:]), - DstMask: tcpip.Address(iptip.DstMask[:]), - DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, - Src: tcpip.Address(iptip.Src[:]), - SrcMask: tcpip.Address(iptip.SrcMask[:]), - SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, - OutputInterface: ifname, - OutputInterfaceMask: ifnameMask, - OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, - }, nil -} - -func containsUnsupportedFields(iptip linux.IPTIP) bool { - // The following features are supported: - // - Protocol - // - Dst and DstMask - // - Src and SrcMask - // - The inverse destination IP check flag - // - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInterface = [linux.IFNAMSIZ]byte{} - // Disable any supported inverse flags. 
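Note: a recurring change in this commit is that stack.Target implementations are now stored in the interface as pointers, so every type assertion gains a `*` (JumpTarget above, AcceptTarget and DropTarget below). Asserting the old value type against a pointer-held interface is not a compile error; it just silently stops matching. A minimal illustration:

package main

import "fmt"

type target interface{ verdict() string }

type acceptTarget struct{}

func (acceptTarget) verdict() string { return "accept" }

func main() {
	var t target = &acceptTarget{} // the interface now holds a pointer

	_, asValue := t.(acceptTarget)    // false: dynamic type is *acceptTarget
	_, asPointer := t.(*acceptTarget) // true
	fmt.Println(asValue, asPointer)
}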
- inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) - return iptip.InputInterface != emptyInterface || - iptip.InputInterfaceMask != emptyInterface || - iptip.Flags != 0 || - iptip.InverseFlags&^inverseMask != 0 -} - -func validUnderflow(rule stack.Rule) bool { +func validUnderflow(rule stack.Rule, ipv6 bool) bool { if len(rule.Matchers) != 0 { return false } - if rule.Filter != emptyFilter { + if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) { return false } switch rule.Target.(type) { - case stack.AcceptTarget, stack.DropTarget: + case *stack.AcceptTarget, *stack.DropTarget: return true default: return false } } -func isUnconditionalAccept(rule stack.Rule) bool { - if !validUnderflow(rule) { +func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool { + if !validUnderflow(rule, ipv6) { return false } - _, ok := rule.Target.(stack.AcceptTarget) + _, ok := rule.Target.(*stack.AcceptTarget) return ok } @@ -520,3 +338,20 @@ func hookFromLinux(hook int) stack.Hook { } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } + +// TargetRevision returns a linux.XTGetRevision for a given target. It sets +// Revision to the highest supported value, unless the provided revision number +// is larger. +func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) { + // Read in the target name and version. + var rev linux.XTGetRevision + if _, err := rev.CopyIn(t, revPtr); err != nil { + return linux.XTGetRevision{}, syserr.FromError(err) + } + maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision) + if !ok { + return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported + } + rev.Revision = maxSupported + return rev, nil +} diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index 8ebdaff18..0e14447fe 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -15,255 +15,357 @@ package netfilter import ( - "errors" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) -// errorTargetName is used to mark targets as error targets. Error targets -// shouldn't be reached - an error has occurred if we fall through to one. -const errorTargetName = "ERROR" +func init() { + // Standard targets include ACCEPT, DROP, RETURN, and JUMP. + registerTargetMaker(&standardTargetMaker{ + NetworkProtocol: header.IPv4ProtocolNumber, + }) + registerTargetMaker(&standardTargetMaker{ + NetworkProtocol: header.IPv6ProtocolNumber, + }) + + // Both user chains and actual errors are represented in iptables by + // error targets. + registerTargetMaker(&errorTargetMaker{ + NetworkProtocol: header.IPv4ProtocolNumber, + }) + registerTargetMaker(&errorTargetMaker{ + NetworkProtocol: header.IPv6ProtocolNumber, + }) + + registerTargetMaker(&redirectTargetMaker{ + NetworkProtocol: header.IPv4ProtocolNumber, + }) + registerTargetMaker(&nfNATTargetMaker{ + NetworkProtocol: header.IPv6ProtocolNumber, + }) +} -// redirectTargetName is used to mark targets as redirect targets. Redirect -// targets should be reached for only NAT and Mangle tables. These targets will -// change the destination port/destination IP for packets. 
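Note: the init function above registers one maker per (target name, network protocol) pair, which is how the same REDIRECT name can unmarshal as the xt_redirect form for IPv4 but as the nf_nat form for IPv6. A self-contained sketch of the registry pattern under assumed, simplified types (the real registerTargetMaker and stack.TargetID are defined elsewhere in gVisor):

package main

import "fmt"

type targetID struct {
	name     string
	netProto int
}

type targetMaker interface{ id() targetID }

type redirectMaker struct{ netProto int }

func (m *redirectMaker) id() targetID {
	return targetID{name: "REDIRECT", netProto: m.netProto}
}

var targetMakers = map[targetID]targetMaker{}

func registerTargetMaker(m targetMaker) { targetMakers[m.id()] = m }

func main() {
	registerTargetMaker(&redirectMaker{netProto: 4})
	registerTargetMaker(&redirectMaker{netProto: 6})
	// The same name resolves to a protocol-specific maker.
	fmt.Println(len(targetMakers)) // 2
}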
-const redirectTargetName = "REDIRECT" +type standardTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} -func marshalTarget(target stack.Target) []byte { +func (sm *standardTargetMaker) id() stack.TargetID { + // Standard targets have the empty string as a name and no revisions. + return stack.TargetID{ + NetworkProtocol: sm.NetworkProtocol, + } +} +func (*standardTargetMaker) marshal(target stack.Target) []byte { + // Translate verdicts the same way as the iptables tool. + var verdict int32 switch tg := target.(type) { - case stack.AcceptTarget: - return marshalStandardTarget(stack.RuleAccept) - case stack.DropTarget: - return marshalStandardTarget(stack.RuleDrop) - case stack.ErrorTarget: - return marshalErrorTarget(errorTargetName) - case stack.UserChainTarget: - return marshalErrorTarget(tg.Name) - case stack.ReturnTarget: - return marshalStandardTarget(stack.RuleReturn) - case stack.RedirectTarget: - return marshalRedirectTarget(tg) - case JumpTarget: - return marshalJumpTarget(tg) + case *stack.AcceptTarget: + verdict = -linux.NF_ACCEPT - 1 + case *stack.DropTarget: + verdict = -linux.NF_DROP - 1 + case *stack.ReturnTarget: + verdict = linux.NF_RETURN + case *JumpTarget: + verdict = int32(tg.Offset) default: panic(fmt.Errorf("unknown target of type %T", target)) } -} - -func marshalStandardTarget(verdict stack.RuleVerdict) []byte { - nflog("convert to binary: marshalling standard target") // The target's name will be the empty string. - target := linux.XTStandardTarget{ + xt := linux.XTStandardTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTStandardTarget, }, - Verdict: translateFromStandardVerdict(verdict), + Verdict: verdict, } ret := make([]byte, 0, linux.SizeOfXTStandardTarget) - return binary.Marshal(ret, usermem.ByteOrder, target) + return binary.Marshal(ret, usermem.ByteOrder, xt) +} + +func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if len(buf) != linux.SizeOfXTStandardTarget { + nflog("buf has wrong size for standard target %d", len(buf)) + return nil, syserr.ErrInvalidArgument + } + var standardTarget linux.XTStandardTarget + buf = buf[:linux.SizeOfXTStandardTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) + + if standardTarget.Verdict < 0 { + // A Verdict < 0 indicates a non-jump verdict. + return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol()) + } + // A verdict >= 0 indicates a jump. + return &JumpTarget{ + Offset: uint32(standardTarget.Verdict), + NetworkProtocol: filter.NetworkProtocol(), + }, nil +} + +type errorTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (em *errorTargetMaker) id() stack.TargetID { + // Error targets have no revision. 
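Note: the verdict encoding used by marshal and unmarshal above is the stock iptables convention: a built-in verdict is stored as -(NF code) - 1, so it is always strictly negative, while any non-negative value is a jump expressed as a byte offset (resolved to a rule index later via the offsets map). RETURN uses the already-negative linux.NF_RETURN constant directly. A worked round trip with the real NF codes (NF_DROP = 0, NF_ACCEPT = 1):

package main

import "fmt"

const (
	nfDrop   = 0 // linux.NF_DROP
	nfAccept = 1 // linux.NF_ACCEPT
)

// encode mirrors the marshal method above for built-in verdicts.
func encode(nfCode int32) int32 { return -nfCode - 1 }

func main() {
	fmt.Println(encode(nfAccept), encode(nfDrop)) // -2 -1

	// Decoding follows unmarshal: negative means a standard verdict,
	// non-negative means a jump to a byte offset.
	for _, v := range []int32{-2, -1, 240} {
		if v < 0 {
			fmt.Printf("standard verdict, NF code %d\n", -v-1)
		} else {
			fmt.Printf("jump to byte offset %d\n", v)
		}
	}
}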
+ return stack.TargetID{ + Name: stack.ErrorTargetName, + NetworkProtocol: em.NetworkProtocol, + } } -func marshalErrorTarget(errorName string) []byte { +func (*errorTargetMaker) marshal(target stack.Target) []byte { + var errorName string + switch tg := target.(type) { + case *stack.ErrorTarget: + errorName = stack.ErrorTargetName + case *stack.UserChainTarget: + errorName = tg.Name + default: + panic(fmt.Sprintf("errorTargetMaker cannot marshal unknown type %T", target)) + } + // This is an error target named error - target := linux.XTErrorTarget{ + xt := linux.XTErrorTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTErrorTarget, }, } - copy(target.Name[:], errorName) - copy(target.Target.Name[:], errorTargetName) + copy(xt.Name[:], errorName) + copy(xt.Target.Name[:], stack.ErrorTargetName) ret := make([]byte, 0, linux.SizeOfXTErrorTarget) - return binary.Marshal(ret, usermem.ByteOrder, target) + return binary.Marshal(ret, usermem.ByteOrder, xt) } -func marshalRedirectTarget(rt stack.RedirectTarget) []byte { +func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if len(buf) != linux.SizeOfXTErrorTarget { + nflog("buf has insufficient size for error target %d", len(buf)) + return nil, syserr.ErrInvalidArgument + } + var errorTarget linux.XTErrorTarget + buf = buf[:linux.SizeOfXTErrorTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget) + + // Error targets are used in 2 cases: + // * An actual error case. These rules have an error + // named stack.ErrorTargetName. The last entry of the table + // is usually an error case to catch any packets that + // somehow fall through every rule. + // * To mark the start of a user defined chain. These + // rules have an error with the name of the chain. + switch name := errorTarget.Name.String(); name { + case stack.ErrorTargetName: + return &stack.ErrorTarget{NetworkProtocol: filter.NetworkProtocol()}, nil + default: + // User defined chain.
+ return &stack.UserChainTarget{ + Name: name, + NetworkProtocol: filter.NetworkProtocol(), + }, nil + } +} + +type redirectTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (rm *redirectTargetMaker) id() stack.TargetID { + return stack.TargetID{ + Name: stack.RedirectTargetName, + NetworkProtocol: rm.NetworkProtocol, + } +} + +func (*redirectTargetMaker) marshal(target stack.Target) []byte { + rt := target.(*stack.RedirectTarget) // This is a redirect target named redirect - target := linux.XTRedirectTarget{ + xt := linux.XTRedirectTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTRedirectTarget, }, } - copy(target.Target.Name[:], redirectTargetName) + copy(xt.Target.Name[:], stack.RedirectTargetName) ret := make([]byte, 0, linux.SizeOfXTRedirectTarget) - target.NfRange.RangeSize = 1 - if rt.RangeProtoSpecified { - target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + xt.NfRange.RangeSize = 1 + xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + xt.NfRange.RangeIPV4.MinPort = htons(rt.Port) + xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort + return binary.Marshal(ret, usermem.ByteOrder, xt) +} + +func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if len(buf) < linux.SizeOfXTRedirectTarget { + nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf)) + return nil, syserr.ErrInvalidArgument + } + + if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { + nflog("redirectTargetMaker: bad proto %d", p) + return nil, syserr.ErrInvalidArgument + } + + var redirectTarget linux.XTRedirectTarget + buf = buf[:linux.SizeOfXTRedirectTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget) + + // Copy linux.XTRedirectTarget to stack.RedirectTarget. + target := stack.RedirectTarget{NetworkProtocol: filter.NetworkProtocol()} + + // RangeSize should be 1. + nfRange := redirectTarget.NfRange + if nfRange.RangeSize != 1 { + nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Check if the flags are valid. + // Also check if we need to map ports or IP. + // For now, redirect target only supports destination port change. + // Port range and IP range are not supported yet. + if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { + nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Port range is not supported yet. + if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { + nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) + return nil, syserr.ErrInvalidArgument } - // Convert port from little endian to big endian. 
- port := make([]byte, 2) - binary.LittleEndian.PutUint16(port, rt.MinPort) - target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port) - binary.LittleEndian.PutUint16(port, rt.MaxPort) - target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port) - return binary.Marshal(ret, usermem.ByteOrder, target) + if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP { + nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinIP, nfRange.RangeIPV4.MaxIP) + return nil, syserr.ErrInvalidArgument + } + + target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) + target.Port = ntohs(nfRange.RangeIPV4.MinPort) + + return &target, nil } -func marshalJumpTarget(jt JumpTarget) []byte { - nflog("convert to binary: marshalling jump target") +type nfNATTarget struct { + Target linux.XTEntryTarget + Range linux.NFNATRange +} - // The target's name will be the empty string. - target := linux.XTStandardTarget{ +const nfNATMarshalledSize = linux.SizeOfXTEntryTarget + linux.SizeOfNFNATRange + +type nfNATTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (rm *nfNATTargetMaker) id() stack.TargetID { + return stack.TargetID{ + Name: stack.RedirectTargetName, + NetworkProtocol: rm.NetworkProtocol, + } +} + +func (*nfNATTargetMaker) marshal(target stack.Target) []byte { + rt := target.(*stack.RedirectTarget) + nt := nfNATTarget{ Target: linux.XTEntryTarget{ - TargetSize: linux.SizeOfXTStandardTarget, + TargetSize: nfNATMarshalledSize, + }, + Range: linux.NFNATRange{ + Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED, + }, - // Verdict is overloaded by the ABI. When positive, it holds - // the jump offset from the start of the table. - Verdict: int32(jt.Offset), } + copy(nt.Target.Name[:], stack.RedirectTargetName) + copy(nt.Range.MinAddr[:], rt.Addr) + copy(nt.Range.MaxAddr[:], rt.Addr) - ret := make([]byte, 0, linux.SizeOfXTStandardTarget) - return binary.Marshal(ret, usermem.ByteOrder, target) + nt.Range.MinProto = htons(rt.Port) + nt.Range.MaxProto = nt.Range.MinProto + + ret := make([]byte, 0, nfNATMarshalledSize) + return binary.Marshal(ret, usermem.ByteOrder, nt) } -// translateFromStandardVerdict translates verdicts the same way as the iptables -// tool. -func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 { - switch verdict { - case stack.RuleAccept: - return -linux.NF_ACCEPT - 1 - case stack.RuleDrop: - return -linux.NF_DROP - 1 - case stack.RuleReturn: - return linux.NF_RETURN - default: - // TODO(gvisor.dev/issue/170): Support Jump. - panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) +func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if size := nfNATMarshalledSize; len(buf) < size { + nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size) + return nil, syserr.ErrInvalidArgument } + + if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { + nflog("nfNATTargetMaker: bad proto %d", p) + return nil, syserr.ErrInvalidArgument + } + + var natRange linux.NFNATRange + buf = buf[linux.SizeOfXTEntryTarget:nfNATMarshalledSize] + binary.Unmarshal(buf, usermem.ByteOrder, &natRange) + + // We don't support port or address ranges.
+ if natRange.MinAddr != natRange.MaxAddr { + nflog("nfNATTargetMaker: MinAddr and MaxAddr are different") + return nil, syserr.ErrInvalidArgument + } + if natRange.MinProto != natRange.MaxProto { + nflog("nfNATTargetMaker: MinProto and MaxProto are different") + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/3549): Check for other flags. + // For now, redirect target only supports destination change. + if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { + nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags) + return nil, syserr.ErrInvalidArgument + } + + target := stack.RedirectTarget{ + NetworkProtocol: filter.NetworkProtocol(), + Addr: tcpip.Address(natRange.MinAddr[:]), + Port: ntohs(natRange.MinProto), + } + + return &target, nil } // translateToStandardTarget translates from the value in a // linux.XTStandardTarget to an stack.Verdict. -func translateToStandardTarget(val int32) (stack.Target, error) { +func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (stack.Target, *syserr.Error) { // TODO(gvisor.dev/issue/170): Support other verdicts. switch val { case -linux.NF_ACCEPT - 1: - return stack.AcceptTarget{}, nil + return &stack.AcceptTarget{NetworkProtocol: netProto}, nil case -linux.NF_DROP - 1: - return stack.DropTarget{}, nil + return &stack.DropTarget{NetworkProtocol: netProto}, nil case -linux.NF_QUEUE - 1: - return nil, errors.New("unsupported iptables verdict QUEUE") + nflog("unsupported iptables verdict QUEUE") + return nil, syserr.ErrInvalidArgument case linux.NF_RETURN: - return stack.ReturnTarget{}, nil + return &stack.ReturnTarget{NetworkProtocol: netProto}, nil default: - return nil, fmt.Errorf("unknown iptables verdict %d", val) + nflog("unknown iptables verdict %d", val) + return nil, syserr.ErrInvalidArgument } } // parseTarget parses a target from optVal. optVal should contain only the // target. -func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) { +func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) { nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { - return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal)) + nflog("optVal has insufficient size for entry target %d", len(optVal)) + return nil, syserr.ErrInvalidArgument } var target linux.XTEntryTarget buf := optVal[:linux.SizeOfXTEntryTarget] binary.Unmarshal(buf, usermem.ByteOrder, &target) - switch target.Name.String() { - case "": - // Standard target. - if len(optVal) != linux.SizeOfXTStandardTarget { - return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal)) - } - var standardTarget linux.XTStandardTarget - buf = optVal[:linux.SizeOfXTStandardTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) - - if standardTarget.Verdict < 0 { - // A Verdict < 0 indicates a non-jump verdict. - return translateToStandardTarget(standardTarget.Verdict) - } - // A verdict >= 0 indicates a jump. - return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil - - case errorTargetName: - // Error target. - if len(optVal) != linux.SizeOfXTErrorTarget { - return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal)) - } - var errorTarget linux.XTErrorTarget - buf = optVal[:linux.SizeOfXTErrorTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget) - - // Error targets are used in 2 cases: - // * An actual error case. 
These rules have an error - // named errorTargetName. The last entry of the table - // is usually an error case to catch any packets that - // somehow fall through every rule. - // * To mark the start of a user defined chain. These - // rules have an error with the name of the chain. - switch name := errorTarget.Name.String(); name { - case errorTargetName: - nflog("set entries: error target") - return stack.ErrorTarget{}, nil - default: - // User defined chain. - nflog("set entries: user-defined target %q", name) - return stack.UserChainTarget{Name: name}, nil - } - - case redirectTargetName: - // Redirect target. - if len(optVal) < linux.SizeOfXTRedirectTarget { - return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal)) - } - - if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - - var redirectTarget linux.XTRedirectTarget - buf = optVal[:linux.SizeOfXTRedirectTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget) - - // Copy linux.XTRedirectTarget to stack.RedirectTarget. - var target stack.RedirectTarget - nfRange := redirectTarget.NfRange - - // RangeSize should be 1. - if nfRange.RangeSize != 1 { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - - // TODO(gvisor.dev/issue/170): Check if the flags are valid. - // Also check if we need to map ports or IP. - // For now, redirect target only supports destination port change. - // Port range and IP range are not supported yet. - if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - target.RangeProtoSpecified = true - - target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) - target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:]) - - // TODO(gvisor.dev/issue/170): Port range is not supported yet. - if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - - // Convert port from big endian to little endian. - port := make([]byte, 2) - binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort) - target.MinPort = binary.LittleEndian.Uint16(port) - - binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort) - target.MaxPort = binary.LittleEndian.Uint16(port) - return target, nil - } - // Unknown target. - return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet", target.Name.String()) + return unmarshalTarget(target, filter, optVal) } // JumpTarget implements stack.Target. @@ -274,9 +376,31 @@ type JumpTarget struct { // RuleNum is the rule to jump to. RuleNum int + + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. +func (jt *JumpTarget) ID() stack.TargetID { + return stack.TargetID{ + NetworkProtocol: jt.NetworkProtocol, + } } // Action implements stack.Target.Action. 
-func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { +func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } + +func ntohs(port uint16) uint16 { + buf := make([]byte, 2) + binary.BigEndian.PutUint16(buf, port) + return usermem.ByteOrder.Uint16(buf) +} + +func htons(port uint16) uint16 { + buf := make([]byte, 2) + usermem.ByteOrder.PutUint16(buf, port) + return binary.BigEndian.Uint16(buf) +} diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 0bfd6c1f4..844acfede 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string { // Match implements Matcher.Match. func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader().View()) + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the stack.Check codepath as matchers are added. + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } - if netHeader.TransportProtocol() != header.TCPProtocolNumber { - return false, false - } + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber { + return false, false } + + default: + // We don't know the network protocol. return false, false } diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index 7ed05461d..63201201c 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string { // Match implements Matcher.Match. func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader().View()) - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. - if netHeader.TransportProtocol() != header.UDPProtocolNumber { - return false, false - } + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false + } - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false } + + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. 
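Note: the ntohs and htons helpers added above convert a port between host order (usermem.ByteOrder) and network order by writing it with one byte order and reading it back with the other. On a little-endian host that swaps the bytes; on a big-endian host both are identity functions. A self-contained equivalent, assuming a little-endian host for the printed values:

package main

import (
	"encoding/binary"
	"fmt"
)

// hostOrder stands in for usermem.ByteOrder; little-endian is an
// assumption for this illustration.
var hostOrder = binary.LittleEndian

func htons(port uint16) uint16 {
	buf := make([]byte, 2)
	hostOrder.PutUint16(buf, port)
	return binary.BigEndian.Uint16(buf)
}

func main() {
	fmt.Printf("%#x\n", htons(0x1f90))        // 0x901f: port 8080, byte-swapped
	fmt.Printf("%#x\n", htons(htons(0x1f90))) // 0x1f90: swapping twice round-trips
}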
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber { + return false, false + } + + default: + // We don't know the network protocol. return false, false } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 0546801bf..1f926aa91 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -16,6 +16,8 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/context", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", @@ -36,8 +38,6 @@ go_library( "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", - "//tools/go_marshal/primitive", ], ) diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go index bb205be0d..e8930f031 100644 --- a/pkg/sentry/socket/netlink/provider_vfs2.go +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -52,6 +52,7 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol vfsfd := &s.vfsfd mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) + defer d.DecRef(t) if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 68a9b9a96..3baad098b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -16,11 +16,14 @@ package netlink import ( + "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -38,8 +41,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) const sizeOfInt32 int = 4 @@ -748,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t buf := make([]byte, src.NumBytes()) n, err := src.CopyIn(ctx, buf) + // io.EOF can be only returned if src is a file, this means that + // sendMsg is called from splice and the error has to be ignored in + // this case. + if err == io.EOF { + err = nil + } if err != nil { // Don't partially consume messages. return 0, syserr.FromError(err) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index a38d25da9..c83b23242 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -82,6 +82,13 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV return fd, nil } +// Release implements vfs.FileDescriptionImpl.Release. +func (s *SocketVFS2) Release(ctx context.Context) { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.socketOpsCommon.Release(ctx) +} + // Readiness implements waiter.Waitable.Readiness. 
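Note: the netlink provider fix above (defer d.DecRef(t)) follows the usual VFS2 reference-counting discipline: sockfs.NewDentry hands back a dentry holding one reference, vfsfd.Init takes a reference of its own, so the constructor must drop the one it was given or the dentry leaks. A simplified, self-contained model of that ownership (not the real vfs types):

package main

import "fmt"

type dentry struct{ refs int }

func newDentry() *dentry  { return &dentry{refs: 1} } // caller owns one ref
func (d *dentry) IncRef() { d.refs++ }
func (d *dentry) DecRef() { d.refs-- }

type fd struct{ d *dentry }

// Init models vfsfd.Init: the file description takes its own reference.
func (f *fd) Init(d *dentry) { d.IncRef(); f.d = d }

func main() {
	d := newDentry()
	defer d.DecRef() // drop the constructor's reference, as in the fix

	var f fd
	f.Init(d)
	fmt.Println(d.refs) // 2 now; 1 after the deferred DecRef, owned by f
}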
func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 1fb777a6c..fae3b6783 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -22,6 +22,8 @@ go_library( "//pkg/binary", "//pkg/context", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/safemem", "//pkg/sentry/arch", @@ -51,8 +53,6 @@ go_library( "//pkg/tcpip/transport/udp", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", - "//tools/go_marshal/primitive", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index e4846bc0b..211f07947 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -40,6 +40,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -62,8 +64,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) func mustCreateMetric(name, description string) *tcpip.StatCounter { @@ -158,6 +158,9 @@ var Metrics = tcpip.Stats{ OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), + IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."), + IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."), + IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), @@ -195,7 +198,6 @@ var Metrics = tcpip.Stats{ PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), - InvalidSourceAddress: mustCreateMetric("/netstack/udp/invalid_source", "Number of UDP datagrams dropped due to invalid source address."), }, } @@ -236,7 +238,7 @@ type commonEndpoint interface { // SetSockOpt implements tcpip.Endpoint.SetSockOpt and // transport.Endpoint.SetSockOpt. - SetSockOpt(interface{}) *tcpip.Error + SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and // transport.Endpoint.SetSockOptBool. 
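Note: the commonEndpoint hunk above narrows SetSockOpt from interface{} to tcpip.SettableSocketOption, and the continuation below does the same for GetSockOpt, trading the old run-time type switch for compile-time checking; options are then passed as pointers (the `&v` pattern throughout the rest of this diff). A sketch of the marker-interface shape this assumes (the real interfaces live in gVisor's tcpip package):

package main

import "fmt"

type settableSocketOption interface{ isSettable() }

type bindToDeviceOption int32

func (*bindToDeviceOption) isSettable() {}

func setSockOpt(opt settableSocketOption) { fmt.Printf("set %T\n", opt) }

func main() {
	v := bindToDeviceOption(3)
	setSockOpt(&v) // options travel by pointer
	// setSockOpt(struct{}{}) // no longer compiles: not a socket option
}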
@@ -248,7 +250,7 @@ type commonEndpoint interface { // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. - GetSockOpt(interface{}) *tcpip.Error + GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and // transport.Endpoint.GetSockOpt. @@ -257,6 +259,9 @@ type commonEndpoint interface { // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + + // LastError implements tcpip.Endpoint.LastError. + LastError() *tcpip.Error } // LINT.IfChange @@ -479,8 +484,35 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release(context.Context) { +func (s *socketOpsCommon) Release(ctx context.Context) { + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventHUp|waiter.EventErr) + defer s.EventUnregister(&e) + s.Endpoint.Close() + + // SO_LINGER option is valid only for TCP. For other socket types + // return after endpoint close. + if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) { + return + } + + var v tcpip.LingerOption + if err := s.Endpoint.GetSockOpt(&v); err != nil { + return + } + + // The case for zero timeout is handled in tcp endpoint close function. + // Close is blocked until either: + // 1. The endpoint state is not in any of the states: FIN-WAIT1, + // CLOSING and LAST_ACK. + // 2. Timeout is reached. + if v.Enabled && v.Timeout != 0 { + t := kernel.TaskFromContext(ctx) + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(v.Timeout) + t.BlockWithDeadline(ch, true, deadline) + } } // Read implements fs.FileOperations.Read. @@ -555,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) { } v := buffer.NewView(size) if _, err := i.src.CopyIn(i.ctx, v); err != nil { + // EOF can be returned only if src is a file and this means it + // is in a splice syscall and the error has to be ignored. + if err == io.EOF { + return v, nil + } return nil, tcpip.ErrBadAddress } return v, nil @@ -803,7 +840,20 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Issue the bind request to the endpoint. - return syserr.TranslateNetstackError(s.Endpoint.Bind(addr)) + err := s.Endpoint.Bind(addr) + if err == tcpip.ErrNoPortAvailable { + // Bind always returns EADDRINUSE irrespective of if the specified port was + // already bound or if an ephemeral port was requested but none were + // available. + // + // tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because + // UDP connect returns EAGAIN on ephemeral port exhaustion. + // + // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. + err = tcpip.ErrPortInUse + } + + return syserr.TranslateNetstackError(err) } // Listen implements the linux syscall listen(2) for sockets backed by @@ -814,7 +864,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { +func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. 
e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -823,7 +873,7 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite // Try to accept the connection again; if it fails, then wait until we // get a notification. for { - if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock { + if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock { return ep, wq, syserr.TranslateNetstackError(err) } @@ -836,15 +886,18 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - // Issue the accept request to get the new endpoint. - ep, wq, terr := s.Endpoint.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { if terr != tcpip.ErrWouldBlock || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error - ep, wq, err = s.blockingAccept(t) + ep, wq, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -864,13 +917,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, var addr linux.SockAddr var addrLen uint32 - if peerRequested { - // Get address of the peer and write it to peer slice. - var err *syserr.Error - addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + if peerAddr != nil { + addr, addrLen = ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ @@ -943,47 +991,12 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us return &val, nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_GET_INFO: - if outLen < linux.SizeOfIPTGetinfo { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) - if err != nil { - return nil, err - } - return &info, nil - - case linux.IPT_SO_GET_ENTRIES: - if outLen < linux.SizeOfIPTGetEntries { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) - if err != nil { - return nil, err - } - return &entries, nil - - } - } - - return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) } // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. 
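Note: Accept above now passes an optional out-parameter through to Endpoint.Accept, so the peer address is captured during the accept itself instead of being fetched afterwards with GetPeerName (which could observe a later disconnect). A toy version of the nil-able out-parameter contract:

package main

import "fmt"

type fullAddress struct {
	addr string
	port uint16
}

// accept models the new Endpoint.Accept(peerAddr) contract: a non-nil
// peerAddr is filled in during the accept itself.
func accept(peerAddr *fullAddress) {
	if peerAddr != nil {
		*peerAddr = fullAddress{addr: "10.0.0.2", port: 49152} // hypothetical peer
	}
}

func main() {
	var peer fullAddress
	accept(&peer) // caller wants the address
	fmt.Println(peer)

	accept(nil) // caller doesn't
}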
-func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (marshal.Marshallable, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -992,10 +1005,10 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in return getSockOptTCP(t, ep, name, outLen) case linux.SOL_IPV6: - return getSockOptIPv6(t, ep, name, outLen) + return getSockOptIPv6(t, s, ep, name, outPtr, outLen) case linux.SOL_IP: - return getSockOptIP(t, ep, name, outLen, family) + return getSockOptIP(t, s, ep, name, outPtr, outLen, family) case linux.SOL_UDP, linux.SOL_ICMPV6, @@ -1025,7 +1038,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } // Get the last error and convert it. - err := ep.GetSockOpt(tcpip.ErrorOption{}) + err := ep.LastError() if err == nil { optP := primitive.Int32(0) return &optP, nil @@ -1176,7 +1189,16 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - linger := linux.Linger{} + var v tcpip.LingerOption + var linger linux.Linger + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if v.Enabled { + linger.OnOff = 1 + } + linger.Linger = int32(v.Timeout.Seconds()) return &linger, nil case linux.SO_SNDTIMEO: @@ -1390,8 +1412,12 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - lingerTimeout := primitive.Int32(time.Duration(v) / time.Second) + var lingerTimeout primitive.Int32 + if v >= 0 { + lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) + } else { + lingerTimeout = -1 + } return &lingerTimeout, nil case linux.TCP_DEFER_ACCEPT: @@ -1437,7 +1463,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal } // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. -func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { +func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.IPV6_V6ONLY: if outLen < sizeOfInt32 { @@ -1490,9 +1516,78 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha vP := primitive.Int32(boolToInt32(v)) return &vP, nil - case linux.SO_ORIGINAL_DST: - // TODO(gvisor.dev/issue/170): ip6tables. - return nil, syserr.ErrInvalidArgument + case linux.IP6T_ORIGINAL_DST: + if outLen < int(binary.Size(linux.SockAddrInet6{})) { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OriginalDestinationOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) + return a.(*linux.SockAddrInet6), nil + + case linux.IP6T_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv6 sockets. 
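Note: SO_LINGER now round-trips through tcpip.LingerOption: setsockopt converts the linux {OnOff, Linger} pair into {Enabled, Timeout}, getsockopt (above) converts it back, and Release blocks for at most Timeout while the connection drains. The conversion in isolation, with mirrored stand-in structs:

package main

import (
	"fmt"
	"time"
)

type lingerOption struct { // mirrors tcpip.LingerOption
	Enabled bool
	Timeout time.Duration
}

type linuxLinger struct { // mirrors linux.Linger
	OnOff  int32
	Linger int32
}

func toLinux(v lingerOption) linuxLinger {
	var l linuxLinger
	if v.Enabled {
		l.OnOff = 1
	}
	l.Linger = int32(v.Timeout.Seconds())
	return l
}

func fromLinux(l linuxLinger) lingerOption {
	return lingerOption{Enabled: l.OnOff != 0, Timeout: time.Second * time.Duration(l.Linger)}
}

func main() {
	fmt.Println(toLinux(lingerOption{Enabled: true, Timeout: 5 * time.Second})) // {1 5}
	fmt.Println(fromLinux(linuxLinger{OnOff: 1, Linger: 5}))                    // {true 5s}
}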
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true) + if err != nil { + return nil, err + } + return &info, nil + + case linux.IP6T_SO_GET_ENTRIES: + // IPTGetEntries is reused for IPv6. + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + // Only valid for raw IPv6 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return &entries, nil + + case linux.IP6T_SO_GET_REVISION_TARGET: + if outLen < linux.SizeOfXTGetRevision { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv6 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) + if err != nil { + return nil, err + } + return &ret, nil default: emitUnimplementedEventIPv6(t, name) @@ -1501,7 +1596,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha } // getSockOptIP implements GetSockOpt when level is SOL_IP. -func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (marshal.Marshallable, *syserr.Error) { +func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.IP_TTL: if outLen < sizeOfInt32 { @@ -1617,6 +1712,66 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet), nil + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false) + if err != nil { + return nil, err + } + return &info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return &entries, nil + + case linux.IPT_SO_GET_REVISION_TARGET: + if outLen < linux.SizeOfXTGetRevision { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. 
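Note: every IPT_SO_GET_* and IP6T_SO_GET_* branch in this hunk gates on the socket being a raw socket of the matching family and returns ENOPROTOOPT otherwise; only the stack lookup and converter differ between branches. The gate, reduced to a self-contained check (the constants below are the real AF_INET and SOCK_RAW values):

package main

import "fmt"

const (
	afInet  = 2 // linux.AF_INET
	sockRaw = 3 // linux.SOCK_RAW
)

// iptOptionAllowed mirrors the guard above: iptables socket options
// are honored only on raw sockets of the matching address family.
func iptOptionAllowed(family, skType int) error {
	if family != afInet || skType != sockRaw {
		return fmt.Errorf("ENOPROTOOPT")
	}
	return nil
}

func main() {
	fmt.Println(iptOptionAllowed(afInet, sockRaw)) // <nil>
	fmt.Println(iptOptionAllowed(afInet, 1))       // ENOPROTOOPT (a stream socket)
}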
+ if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) + if err != nil { + return nil, err + } + return &ret, nil + default: emitUnimplementedEventIP(t, name) } @@ -1650,26 +1805,6 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa return nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_SET_REPLACE: - if len(optVal) < linux.SizeOfIPTReplace { - return syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return syserr.ErrNoDevice - } - // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal) - - case linux.IPT_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. - return nil - } - } - return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } @@ -1684,21 +1819,26 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int return setSockOptTCP(t, ep, name, optVal) case linux.SOL_IPV6: - return setSockOptIPv6(t, ep, name, optVal) + return setSockOptIPv6(t, s, ep, name, optVal) case linux.SOL_IP: - return setSockOptIP(t, ep, name, optVal) + return setSockOptIP(t, s, ep, name, optVal) + + case linux.SOL_PACKET: + // gVisor doesn't support any SOL_PACKET options; just return not + // supported. Returning nil here will result in tcpdump thinking AF_PACKET + // features are supported, proceeding to use them and breaking. + t.Kernel().EmitUnimplementedEvent(t) + return syserr.ErrProtocolNotAvailable case linux.SOL_UDP, linux.SOL_ICMPV6, - linux.SOL_RAW, - linux.SOL_PACKET: + linux.SOL_RAW: t.Kernel().EmitUnimplementedEvent(t) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
@@ -1743,7 +1883,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } name := string(optVal[:n]) if name == "" { - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0))) + v := tcpip.BindToDeviceOption(0) + return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) } s := t.NetworkContext() if s == nil { @@ -1751,7 +1892,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } for nicID, nic := range s.Interfaces() { if nic.Name == name { - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID))) + v := tcpip.BindToDeviceOption(nicID) + return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) } } return syserr.ErrUnknownDevice @@ -1817,7 +1959,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam socket.SetSockOptEmitUnimplementedEvent(t, name) } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v))) + opt := tcpip.OutOfBandInlineOption(v) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.SO_NO_CHECK: if len(optVal) < sizeOfInt32 { @@ -1839,19 +1982,21 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam socket.SetSockOptEmitUnimplementedEvent(t, name) } - return nil + return syserr.TranslateNetstackError( + ep.SetSockOpt(&tcpip.LingerOption{ + Enabled: v.OnOff != 0, + Timeout: time.Second * time.Duration(v.Linger)})) case linux.SO_DETACH_FILTER: // optval is ignored. var v tcpip.SocketDetachFilterOption - return syserr.TranslateNetstackError(ep.SetSockOpt(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) default: socket.SetSockOptEmitUnimplementedEvent(t, name) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // setSockOptTCP implements SetSockOpt when level is SOL_TCP. 
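The setSockOptTCP hunks that follow share one shape: read an int32 from optVal, validate it against a Linux bound, scale seconds into a time.Duration, and hand the stack a pointer to the resulting option. A self-contained sketch of that conversion; binary.NativeEndian stands in for usermem.ByteOrder, and the TCP_KEEPIDLE bound is hard-coded here on the assumption that it matches linux.MAX_TCP_KEEPIDLE (32767):

package main

import (
	"encoding/binary"
	"errors"
	"fmt"
	"time"
)

// Assumed to match linux.MAX_TCP_KEEPIDLE; not imported from gVisor.
const maxTCPKeepidle = 32767

var errInvalidArgument = errors.New("invalid argument")

// keepaliveIdle mirrors the TCP_KEEPIDLE case below: userspace supplies a
// count of seconds as a native-endian int32, the sentry validates the
// range, and the stack stores a duration-valued option.
func keepaliveIdle(optVal []byte) (time.Duration, error) {
	if len(optVal) < 4 {
		return 0, errInvalidArgument
	}
	// binary.NativeEndian (Go 1.21+) stands in for usermem.ByteOrder.
	v := binary.NativeEndian.Uint32(optVal)
	if v < 1 || v > maxTCPKeepidle {
		return 0, errInvalidArgument
	}
	return time.Second * time.Duration(v), nil
}

func main() {
	buf := make([]byte, 4)
	binary.NativeEndian.PutUint32(buf, 75)
	d, err := keepaliveIdle(buf)
	fmt.Println(d, err) // prints: 1m15s <nil>
}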
@@ -1898,7 +2043,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 1 || v > linux.MAX_TCP_KEEPIDLE { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)))) + opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPINTVL: if len(optVal) < sizeOfInt32 { @@ -1909,7 +2055,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 1 || v > linux.MAX_TCP_KEEPINTVL { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPCNT: if len(optVal) < sizeOfInt32 { @@ -1931,11 +2078,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 0 { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)))) + opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_CONGESTION: v := tcpip.CongestionControlOption(optVal) - if err := ep.SetSockOpt(v); err != nil { + if err := ep.SetSockOpt(&v); err != nil { return syserr.TranslateNetstackError(err) } return nil @@ -1945,8 +2093,9 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * return syserr.ErrInvalidArgument } - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)))) + v := int32(usermem.ByteOrder.Uint32(optVal)) + opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_DEFER_ACCEPT: if len(optVal) < sizeOfInt32 { @@ -1956,7 +2105,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 0 { v = 0 } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_SYNCNT: if len(optVal) < sizeOfInt32 { @@ -1981,12 +2131,11 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * emitUnimplementedEventTCP(t, name) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. -func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.IPV6_V6ONLY: if len(optVal) < sizeOfInt32 { @@ -2035,12 +2184,32 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0)) + case linux.IP6T_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIP6TReplace { + return syserr.ErrInvalidArgument + } + + // Only valid for raw IPv6 sockets. 
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true) + + case linux.IP6T_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + default: emitUnimplementedEventIPv6(t, name) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } var ( @@ -2095,7 +2264,7 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) { } // setSockOptIP implements SetSockOpt when level is SOL_IP. -func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.IP_MULTICAST_TTL: v, err := parseIntOrChar(optVal) @@ -2118,7 +2287,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ + return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change AddMembership to use the standard // any address representation. @@ -2132,7 +2301,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ + return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change DropMembership to use the standard // any address representation. @@ -2146,7 +2315,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{ + return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ NIC: tcpip.NICID(req.InterfaceIndex), InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), })) @@ -2215,6 +2384,27 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0)) + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, @@ -2249,8 +2439,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s t.Kernel().EmitUnimplementedEvent(t) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // emitUnimplementedEventTCP emits unimplemented event if name is valid. 
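In the netstack_vfs2.go diff below, and again in the unix socket files further down, Accept gains a *tcpip.FullAddress out-parameter: a caller that wants the peer's name passes a non-nil address to be filled in during the accept itself, replacing the old follow-up GetPeerName call on the newly created socket. A dependency-free sketch of the convention, with simplified stand-in types:

package main

import "fmt"

// fullAddress is a simplified stand-in for tcpip.FullAddress.
type fullAddress struct {
	addr string
	port uint16
}

type endpoint struct {
	peer fullAddress
}

// accept mirrors the new Endpoint.Accept(peerAddr *tcpip.FullAddress)
// contract: a non-nil peerAddr is populated with the connected peer's
// address on a successful accept; nil means the caller does not want it.
func (e *endpoint) accept(peerAddr *fullAddress) (*endpoint, error) {
	ne := &endpoint{peer: e.peer}
	if peerAddr != nil {
		*peerAddr = ne.peer
	}
	return ne, nil
}

func main() {
	listener := &endpoint{peer: fullAddress{addr: "192.0.2.1", port: 4242}}

	// peerRequested == true: allocate the out-parameter up front.
	peerAddr := &fullAddress{}
	if _, err := listener.accept(peerAddr); err == nil {
		fmt.Println("accepted from", peerAddr.addr, peerAddr.port)
	}

	// peerRequested == false: pass nil and skip the copy entirely.
	if _, err := listener.accept(nil); err == nil {
		fmt.Println("accepted without peer address")
	}
}

Passing nil keeps the peerRequested == false path free of any address copy, which is why both the netstack and unix Accept implementations below allocate the out-parameter only when the caller asked for the peer's address.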
This diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 3335e7430..4c6791fff 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -18,21 +18,19 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" - "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // SocketVFS2 encapsulates all the state needed to represent a network stack @@ -58,6 +56,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) + defer d.DecRef(t) s := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ @@ -80,6 +79,13 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu return vfsfd, nil } +// Release implements vfs.FileDescriptionImpl.Release. +func (s *SocketVFS2) Release(ctx context.Context) { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.socketOpsCommon.Release(ctx) +} + // Readiness implements waiter.Waitable.Readiness. func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) @@ -152,14 +158,18 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // tcpip.Endpoint. func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. - ep, wq, terr := s.Endpoint.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { if terr != tcpip.ErrWouldBlock || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error - ep, wq, err = s.blockingAccept(t) + ep, wq, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -177,13 +187,9 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block var addr linux.SockAddr var addrLen uint32 - if peerRequested { + if peerAddr != nil { // Get address of the peer and write it to peer slice. - var err *syserr.Error - addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + addr, addrLen = ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ @@ -233,42 +239,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem. 
return &val, nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_GET_INFO: - if outLen < linux.SizeOfIPTGetinfo { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) - if err != nil { - return nil, err - } - return &info, nil - - case linux.IPT_SO_GET_ENTRIES: - if outLen < linux.SizeOfIPTGetEntries { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) - if err != nil { - return nil, err - } - return &entries, nil - - } - } - - return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by @@ -298,26 +269,6 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_SET_REPLACE: - if len(optVal) < linux.SizeOfIPTReplace { - return syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return syserr.ErrNoDevice - } - // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal) - - case linux.IPT_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. - return nil - } - } - return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index f0fe18684..1028d2a6e 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -155,7 +155,7 @@ func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { - var rs tcp.ReceiveBufferSizeOption + var rs tcpip.TCPReceiveBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs) return inet.TCPBufferSize{ Min: rs.Min, @@ -166,17 +166,17 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { - rs := tcp.ReceiveBufferSizeOption{ + rs := tcpip.TCPReceiveBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError() + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError() } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { - var ss tcp.SendBufferSizeOption + var ss tcpip.TCPSendBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss) return inet.TCPBufferSize{ Min: ss.Min, @@ -187,29 +187,30 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. 
func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { - ss := tcp.SendBufferSizeOption{ + ss := tcpip.TCPSendBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError() + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError() } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. func (s *Stack) TCPSACKEnabled() (bool, error) { - var sack tcp.SACKEnabled + var sack tcpip.TCPSACKEnabled err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack) return bool(sack), syserr.TranslateNetstackError(err).ToError() } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. func (s *Stack) SetTCPSACKEnabled(enabled bool) error { - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() + opt := tcpip.TCPSACKEnabled(enabled) + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // TCPRecovery implements inet.Stack.TCPRecovery. func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { - var recovery tcp.Recovery + var recovery tcpip.TCPRecovery if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } @@ -218,7 +219,8 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { // SetTCPRecovery implements inet.Stack.SetTCPRecovery. func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError() + opt := tcpip.TCPRecovery(recovery) + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // Statistics implements inet.Stack.Statistics. @@ -410,3 +412,24 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { s.Stack.RestoreCleanupEndpoints(es) } + +// Forwarding implements inet.Stack.Forwarding. +func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { + switch protocol { + case ipv4.ProtocolNumber, ipv6.ProtocolNumber: + return s.Stack.Forwarding(protocol) + default: + panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol)) + } +} + +// SetForwarding implements inet.Stack.SetForwarding. 
+func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { + switch protocol { + case ipv4.ProtocolNumber, ipv6.ProtocolNumber: + s.Stack.SetForwarding(protocol, enable) + default: + panic(fmt.Sprintf("SetForwarding(%v) failed: unsupported protocol", protocol)) + } + return nil +} diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 04b259d27..fd31479e5 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -35,7 +36,6 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // ControlMessages represents the union of unix control messages and tcpip diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index cb953e4dc..cc7408698 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -7,10 +7,21 @@ go_template_instance( name = "socket_refs", out = "socket_refs.go", package = "unix", - prefix = "socketOpsCommon", + prefix = "socketOperations", template = "//pkg/refs_vfs2:refs_template", types = { - "T": "socketOpsCommon", + "T": "SocketOperations", + }, +) + +go_template_instance( + name = "socket_vfs2_refs", + out = "socket_vfs2_refs.go", + package = "unix", + prefix = "socketVFS2", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "SocketVFS2", }, ) @@ -20,6 +31,7 @@ go_library( "device.go", "io.go", "socket_refs.go", + "socket_vfs2_refs.go", "unix.go", "unix_vfs2.go", ], @@ -29,6 +41,7 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", @@ -49,6 +62,5 @@ go_library( "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index c708b6030..26c3a51b9 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "queue_refs", + out = "queue_refs.go", + package = "transport", + prefix = "queue", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "queue", + }, +) + go_library( name = "transport", srcs = [ @@ -22,6 +33,7 @@ go_library( "connectioned_state.go", "connectionless.go", "queue.go", + "queue_refs.go", "transport_message_list.go", "unix.go", ], diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index c67b602f0..aa4f3c04d 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} - q1.EnableLeakCheck("transport.queue") + q1.EnableLeakCheck() q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} - q2.EnableLeakCheck("transport.queue") + q2.EnableLeakCheck() if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce 
Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} - readQueue.EnableLeakCheck("transport.queue") + readQueue.EnableLeakCheck() ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} - writeQueue.EnableLeakCheck("transport.queue") + writeQueue.EnableLeakCheck() if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { @@ -391,7 +391,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error { } // Accept accepts a new connection. -func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) { +func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) { e.Lock() defer e.Unlock() @@ -401,6 +401,18 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) { select { case ne := <-e.acceptedChan: + if peerAddr != nil { + ne.Lock() + c := ne.connected + ne.Unlock() + if c != nil { + addr, err := c.GetLocalAddress() + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + *peerAddr = addr + } + } return ne, nil default: diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 70ee8f9b8..f8aacca13 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -42,7 +42,7 @@ var ( func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} - q.EnableLeakCheck("transport.queue") + q.EnableLeakCheck() ep.receiver = &queueReceiver{readQueue: &q} return ep } @@ -144,12 +144,12 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi } // Listen starts listening on the connection. -func (e *connectionlessEndpoint) Listen(int) *syserr.Error { +func (*connectionlessEndpoint) Listen(int) *syserr.Error { return syserr.ErrNotSupported } // Accept accepts a new connection. -func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) { +func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) { return nil, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index ef6043e19..342def28f 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -16,7 +16,6 @@ package transport import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" @@ -28,7 +27,7 @@ import ( // // +stateify savable type queue struct { - refs.AtomicRefCount + queueRefs ReaderQueue *waiter.Queue WriterQueue *waiter.Queue @@ -68,11 +67,13 @@ func (q *queue) Reset(ctx context.Context) { q.mu.Unlock() } -// DecRef implements RefCounter.DecRef with destructor q.Reset. +// DecRef implements RefCounter.DecRef. func (q *queue) DecRef(ctx context.Context) { - q.DecRefWithDestructor(ctx, q.Reset) - // We don't need to notify after resetting because no one cares about - // this queue after all references have been dropped. + q.queueRefs.DecRef(func() { + // We don't need to notify after resetting because no one cares about + // this queue after all references have been dropped. 
+ q.Reset(ctx) + }) } // IsReadable determines if q is currently readable. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 475d7177e..d6fc03520 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -151,7 +151,10 @@ type Endpoint interface { // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. - Accept() (Endpoint, *syserr.Error) + // + // peerAddr if not nil will be populated with the address of the connected + // peer on a successful accept. + Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. @@ -172,9 +175,8 @@ type Endpoint interface { // connected. GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) - // SetSockOpt sets a socket option. opt should be one of the tcpip.*Option - // types. - SetSockOpt(opt interface{}) *tcpip.Error + // SetSockOpt sets a socket option. + SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error // SetSockOptBool sets a socket option for simple cases when a value has // the int type. @@ -184,9 +186,8 @@ type Endpoint interface { // the int type. SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error - // GetSockOpt gets a socket option. opt should be a pointer to one of the - // tcpip.*Option types. - GetSockOpt(opt interface{}) *tcpip.Error + // GetSockOpt gets a socket option. + GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error // GetSockOptBool gets a socket option for simple cases when a return // value has the int type. @@ -199,6 +200,9 @@ type Endpoint interface { // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 + + // LastError implements tcpip.Endpoint.LastError. + LastError() *tcpip.Error } // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket @@ -742,6 +746,9 @@ type baseEndpoint struct { // path is not empty if the endpoint has been bound, // or may be used if the endpoint is connected. path string + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // EventRegister implements waiter.Waitable.EventRegister. @@ -837,8 +844,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return n, err } -// SetSockOpt sets a socket option. Currently not supported. -func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { +// SetSockOpt sets a socket option. +func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { + switch v := opt.(type) { + case *tcpip.LingerOption: + e.Lock() + e.linger = *v + e.Unlock() + } return nil } @@ -940,9 +953,12 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch opt.(type) { - case tcpip.ErrorOption: +func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.Lock() + *o = e.linger + e.Unlock() return nil default: @@ -951,6 +967,11 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { } } +// LastError implements Endpoint.LastError. +func (*baseEndpoint) LastError() *tcpip.Error { + return nil +} + // Shutdown closes the read and/or write end of the endpoint connection to its // peer. 
func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index b7e8e4325..a4a76d0a3 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -39,7 +40,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // SocketOperations is a Unix socket. It is similar to a netstack socket, @@ -55,6 +55,7 @@ type SocketOperations struct { fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + socketOperationsRefs socketOpsCommon } @@ -84,11 +85,27 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty return fs.NewFile(ctx, d, flags, &s) } +// DecRef implements RefCounter.DecRef. +func (s *SocketOperations) DecRef(ctx context.Context) { + s.socketOperationsRefs.DecRef(func() { + s.ep.Close(ctx) + if s.abstractNamespace != nil { + s.abstractNamespace.Remove(s.abstractName, s) + } + }) +} + +// Release implemements fs.FileOperations.Release. +func (s *SocketOperations) Release(ctx context.Context) { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef(ctx) +} + // socketOpsCommon contains the socket operations common to VFS1 and VFS2. // // +stateify savable type socketOpsCommon struct { - socketOpsCommonRefs socket.SendReceiveTimeout ep transport.Endpoint @@ -101,23 +118,6 @@ type socketOpsCommon struct { abstractNamespace *kernel.AbstractSocketNamespace } -// DecRef implements RefCounter.DecRef. -func (s *socketOpsCommon) DecRef(ctx context.Context) { - s.socketOpsCommonRefs.DecRef(func() { - s.ep.Close(ctx) - if s.abstractNamespace != nil { - s.abstractNamespace.Remove(s.abstractName, s) - } - }) -} - -// Release implemements fs.FileOperations.Release. -func (s *socketOpsCommon) Release(ctx context.Context) { - // Release only decrements a reference on s because s may be referenced in - // the abstract socket namespace. - s.DecRef(ctx) -} - func (s *socketOpsCommon) isPacket() bool { switch s.stype { case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: @@ -194,7 +194,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { - return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen) } // Listen implements the linux syscall listen(2) for sockets backed by @@ -205,7 +205,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. 
-func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { +func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -214,7 +214,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Try to accept the connection; if it fails, then wait until we get a // notification. for { - if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock { return ep, err } @@ -227,15 +227,18 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - // Issue the accept request to get the new endpoint. - ep, err := s.ep.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, err := s.ep.Accept(peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error - ep, err = s.blockingAccept(t) + ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -252,13 +255,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, var addr linux.SockAddr var addrLen uint32 - if peerRequested { - // Get address of the peer. - var err *syserr.Error - addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + if peerAddr != nil { + addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ @@ -575,13 +573,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &EndpointReader{ + r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, - }) + } + n, err := dst.CopyOutFrom(ctx, r) + // Drop control messages. + r.Control.Release(ctx) + return n, err } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index d066ef8ab..678355fb9 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" @@ -32,17 +33,19 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // SocketVFS2 implements socket.SocketVFS2 (and by extension, // vfs.FileDescriptionImpl) for Unix sockets. 
+// +// +stateify savable type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD + socketVFS2Refs socketOpsCommon } @@ -53,6 +56,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{}) func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) + defer d.DecRef(t) fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{}) if err != nil { @@ -88,15 +92,34 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3 return vfsfd, nil } +// DecRef implements RefCounter.DecRef. +func (s *SocketVFS2) DecRef(ctx context.Context) { + s.socketVFS2Refs.DecRef(func() { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.ep.Close(ctx) + if s.abstractNamespace != nil { + s.abstractNamespace.Remove(s.abstractName, s) + } + }) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (s *SocketVFS2) Release(ctx context.Context) { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef(ctx) +} + // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { - return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { +func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.socketOpsCommon.EventRegister(&e, waiter.EventIn) @@ -105,7 +128,7 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr // Try to accept the connection; if it fails, then wait until we get a // notification. for { - if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock { return ep, err } @@ -118,15 +141,18 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - // Issue the accept request to get the new endpoint. - ep, err := s.ep.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, err := s.ep.Accept(peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error - ep, err = s.blockingAccept(t) + ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -144,13 +170,8 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block var addr linux.SockAddr var addrLen uint32 - if peerRequested { - // Get address of the peer. 
- var err *syserr.Error - addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + if peerAddr != nil { + addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ @@ -246,13 +267,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &EndpointReader{ + r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, - }) + } + n, err := dst.CopyOutFrom(ctx, r) + // Drop control messages. + r.Control.Release(ctx) + return n, err } // PWrite implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index a06c9b8ab..245d2c5cf 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -61,8 +61,10 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { log.Infof("Sandbox save started, pausing all tasks.") k.Pause() k.ReceiveTaskStates() - defer k.Unpause() - defer log.Infof("Tasks resumed after save.") + defer func() { + k.Unpause() + log.Infof("Tasks resumed after save.") + }() w.Stop() defer w.Start() diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 88d5db9fc..a920180d3 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/binary", "//pkg/bits", "//pkg/eventchannel", + "//pkg/marshal/primitive", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go index 5d51a7792..ae3b998c8 100644 --- a/pkg/sentry/strace/epoll.go +++ b/pkg/sentry/strace/epoll.go @@ -26,7 +26,7 @@ import ( func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string { var e linux.EpollEvent - if _, err := t.CopyIn(eventAddr, &e); err != nil { + if _, err := e.CopyIn(t, eventAddr); err != nil { return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err) } var sb strings.Builder @@ -41,7 +41,7 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui addr := eventsAddr for i := uint64(0); i < numEvents; i++ { var e linux.EpollEvent - if _, err := t.CopyIn(addr, &e); err != nil { + if _, err := e.CopyIn(t, addr); err != nil { fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err) continue } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index b51c4c941..cc5f70cd4 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" @@ -166,7 +167,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) } buf := make([]byte, length) - if _, err := t.CopyIn(addr, &buf); err != nil { + if _, err := t.CopyInBytes(addr, buf); err != nil { return fmt.Sprintf("%#x (error decoding control: %v)", addr, err) } @@ -302,7 +303,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) func msghdr(t *kernel.Task, addr usermem.Addr, printContent bool, maxBytes uint64) string { var msg slinux.MessageHeader64 - if err := slinux.CopyInMessageHeader64(t, addr, &msg); err != nil { + if _, err := msg.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding msghdr: 
%v)", addr, err) } s := fmt.Sprintf( @@ -380,9 +381,9 @@ func postSockAddr(t *kernel.Task, addr usermem.Addr, lengthPtr usermem.Addr) str func copySockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) { // socklen_t is 32-bits. - var l uint32 - _, err := t.CopyIn(addr, &l) - return l, err + var l primitive.Uint32 + _, err := l.CopyIn(t, addr) + return uint32(l), err } func sockLenPointer(t *kernel.Task, addr usermem.Addr) string { @@ -436,22 +437,22 @@ func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, o func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string { switch optLen { case 1: - var v uint8 - _, err := t.CopyIn(optVal, &v) + var v primitive.Uint8 + _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) case 2: - var v uint16 - _, err := t.CopyIn(optVal, &v) + var v primitive.Uint16 + _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) case 4: - var v uint32 - _, err := t.CopyIn(optVal, &v) + var v primitive.Uint32 + _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } @@ -632,6 +633,8 @@ var sockOptNames = map[uint64]abi.ValueSet{ linux.IPV6_UNICAST_IF: "IPV6_UNICAST_IF", linux.MCAST_MSFILTER: "MCAST_MSFILTER", linux.IPV6_ADDRFORM: "IPV6_ADDRFORM", + linux.IP6T_SO_GET_INFO: "IP6T_SO_GET_INFO", + linux.IP6T_SO_GET_ENTRIES: "IP6T_SO_GET_ENTRIES", }, linux.SOL_NETLINK: { linux.NETLINK_BROADCAST_ERROR: "NETLINK_BROADCAST_ERROR", diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 87b239730..396744597 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -17,17 +17,16 @@ package strace import ( - "encoding/binary" "fmt" "strconv" "strings" - "syscall" "time" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -91,7 +90,7 @@ func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, ma } b := make([]byte, size) - amt, err := t.CopyIn(ar.Start, b) + amt, err := t.CopyInBytes(ar.Start, b) if err != nil { iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err) continue @@ -118,7 +117,7 @@ func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) st } b := make([]byte, size) - amt, err := t.CopyIn(addr, b) + amt, err := t.CopyInBytes(addr, b) if err != nil { return fmt.Sprintf("%#x (error decoding string: %s)", addr, err) } @@ -199,7 +198,7 @@ func fdVFS2(t *kernel.Task, fd int32) string { func fdpair(t *kernel.Task, addr usermem.Addr) string { var fds [2]int32 - _, err := t.CopyIn(addr, &fds) + _, err := primitive.CopyInt32SliceIn(t, addr, fds[:]) if err != nil { return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err) } @@ -209,7 +208,7 @@ func fdpair(t *kernel.Task, addr usermem.Addr) string { func uname(t *kernel.Task, addr usermem.Addr) string { var u linux.UtsName - if _, err := t.CopyIn(addr, &u); err != nil { + if _, err := u.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err) } @@ -222,7 +221,7 @@ 
func utimensTimespec(t *kernel.Task, addr usermem.Addr) string { } var tim linux.Timespec - if _, err := t.CopyIn(addr, &tim); err != nil { + if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) } @@ -244,7 +243,7 @@ func timespec(t *kernel.Task, addr usermem.Addr) string { } var tim linux.Timespec - if _, err := t.CopyIn(addr, &tim); err != nil { + if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) } return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec) @@ -256,7 +255,7 @@ func timeval(t *kernel.Task, addr usermem.Addr) string { } var tim linux.Timeval - if _, err := t.CopyIn(addr, &tim); err != nil { + if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err) } @@ -268,8 +267,8 @@ func utimbuf(t *kernel.Task, addr usermem.Addr) string { return "null" } - var utim syscall.Utimbuf - if _, err := t.CopyIn(addr, &utim); err != nil { + var utim linux.Utime + if _, err := utim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err) } @@ -282,7 +281,7 @@ func stat(t *kernel.Task, addr usermem.Addr) string { } var stat linux.Stat - if _, err := t.CopyIn(addr, &stat); err != nil { + if _, err := stat.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err) } return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec)) @@ -294,7 +293,7 @@ func itimerval(t *kernel.Task, addr usermem.Addr) string { } interval := timeval(t, addr) - value := timeval(t, addr+usermem.Addr(binary.Size(linux.Timeval{}))) + value := timeval(t, addr+usermem.Addr((*linux.Timeval)(nil).SizeBytes())) return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } @@ -304,7 +303,7 @@ func itimerspec(t *kernel.Task, addr usermem.Addr) string { } interval := timespec(t, addr) - value := timespec(t, addr+usermem.Addr(binary.Size(linux.Timespec{}))) + value := timespec(t, addr+usermem.Addr((*linux.Timespec)(nil).SizeBytes())) return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } @@ -330,7 +329,7 @@ func rusage(t *kernel.Task, addr usermem.Addr) string { } var ru linux.Rusage - if _, err := t.CopyIn(addr, &ru); err != nil { + if _, err := ru.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err) } return fmt.Sprintf("%#x %+v", addr, ru) @@ -342,7 +341,7 @@ func capHeader(t *kernel.Task, addr usermem.Addr) string { } var hdr linux.CapUserHeader - if _, err := t.CopyIn(addr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding header: %s)", addr, err) } @@ -367,7 +366,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { } var hdr linux.CapUserHeader - if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err) } @@ -376,7 +375,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: var 
data linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := data.CopyIn(t, dataAddr); err != nil { return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) } p = uint64(data.Permitted) @@ -384,7 +383,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { e = uint64(data.Effective) case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: var data [2]linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) } p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 4a9b04fd0..a2e441448 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -21,6 +21,7 @@ go_library( "sys_identity.go", "sys_inotify.go", "sys_lseek.go", + "sys_membarrier.go", "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", @@ -56,6 +57,7 @@ go_library( "sys_xattr.go", "timespec.go", ], + marshal = True, visibility = ["//:sandbox"], deps = [ "//pkg/abi", @@ -64,6 +66,8 @@ go_library( "//pkg/bpf", "//pkg/context", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/rand", "//pkg/safemem", @@ -99,7 +103,5 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", - "//tools/go_marshal/primitive", ], ) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 80c65164a..9c9def7cd 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -138,7 +138,7 @@ var AMD64 = &kernel.SyscallTable{ 83: syscalls.Supported("mkdir", Mkdir), 84: syscalls.Supported("rmdir", Rmdir), 85: syscalls.Supported("creat", Creat), - 86: syscalls.Supported("link", Link), + 86: syscalls.PartiallySupported("link", Link, "Limited support with Gofer. Link count and linked files may get out of sync because gVisor is not aware of external hardlinks.", nil), 87: syscalls.Supported("unlink", Unlink), 88: syscalls.Supported("symlink", Symlink), 89: syscalls.Supported("readlink", Readlink), @@ -305,9 +305,9 @@ var AMD64 = &kernel.SyscallTable{ 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), - 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. 
Hard links are treated as different watch targets in gofer fs.", nil), 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), 257: syscalls.Supported("openat", Openat), 258: syscalls.Supported("mkdirat", Mkdirat), @@ -317,7 +317,7 @@ var AMD64 = &kernel.SyscallTable{ 262: syscalls.Supported("fstatat", Fstatat), 263: syscalls.Supported("unlinkat", Unlinkat), 264: syscalls.Supported("renameat", Renameat), - 265: syscalls.Supported("linkat", Linkat), + 265: syscalls.PartiallySupported("linkat", Linkat, "See link(2).", nil), 266: syscalls.Supported("symlinkat", Symlinkat), 267: syscalls.Supported("readlinkat", Readlinkat), 268: syscalls.Supported("fchmodat", Fchmodat), @@ -346,7 +346,7 @@ var AMD64 = &kernel.SyscallTable{ 291: syscalls.Supported("epoll_create1", EpollCreate1), 292: syscalls.Supported("dup3", Dup3), 293: syscalls.Supported("pipe2", Pipe2), - 294: syscalls.Supported("inotify_init1", InotifyInit1), + 294: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), 295: syscalls.Supported("preadv", Preadv), 296: syscalls.Supported("pwritev", Pwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), @@ -376,7 +376,7 @@ var AMD64 = &kernel.SyscallTable{ 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 322: syscalls.Supported("execveat", Execveat), 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls implemented after 325 are "backports" from versions @@ -454,9 +454,9 @@ var ARM64 = &kernel.SyscallTable{ 23: syscalls.Supported("dup", Dup), 24: syscalls.Supported("dup3", Dup3), 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 26: syscalls.Supported("inotify_init1", InotifyInit1), - 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 26: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. 
Hard links are treated as different watch targets in gofer fs.", nil), 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) @@ -527,8 +527,8 @@ var ARM64 = &kernel.SyscallTable{ 96: syscalls.Supported("set_tid_address", SetTidAddress), 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 99: syscalls.Supported("set_robust_list", SetRobustList), + 100: syscalls.Supported("get_robust_list", GetRobustList), 101: syscalls.Supported("nanosleep", Nanosleep), 102: syscalls.Supported("getitimer", Getitimer), 103: syscalls.Supported("setitimer", Setitimer), @@ -695,7 +695,7 @@ var ARM64 = &kernel.SyscallTable{ 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 281: syscalls.Supported("execveat", Execveat), 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 284 are "backports" from versions of Linux after 4.4. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index e9d64dec5..0bf313a13 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -17,6 +17,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -36,7 +37,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // // The context pointer _must_ be zero initially. var idIn uint64 - if _, err := t.CopyIn(idAddr, &idIn); err != nil { + if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil { return 0, nil, err } if idIn != 0 { @@ -49,7 +50,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Copy out the new ID. - if _, err := t.CopyOut(idAddr, &id); err != nil { + if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil { t.MemoryManager().DestroyAIOContext(t, id) return 0, nil, err } @@ -142,7 +143,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S ev := v.(*linux.IOEvent) // Copy out the result. - if _, err := t.CopyOut(eventsAddr, ev); err != nil { + if _, err := ev.CopyOut(t, eventsAddr); err != nil { if count > 0 { return uintptr(count), nil, nil } @@ -338,21 +339,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } for i := int32(0); i < nrEvents; i++ { - // Copy in the address. 
- cbAddrNative := t.Arch().Native(0) - if _, err := t.CopyIn(addr, cbAddrNative); err != nil { - if i > 0 { - // Some successful. - return uintptr(i), nil, nil + // Copy in the callback address. + var cbAddr usermem.Addr + switch t.Arch().Width() { + case 8: + var cbAddrP primitive.Uint64 + if _, err := cbAddrP.CopyIn(t, addr); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err } - // Nothing done. - return 0, nil, err + cbAddr = usermem.Addr(cbAddrP) + default: + return 0, nil, syserror.ENOSYS } // Copy in this callback. var cb linux.IOCallback - cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) - if _, err := t.CopyIn(cbAddr, &cb); err != nil { + if _, err := cb.CopyIn(t, cbAddr); err != nil { if i > 0 { // Some have been successful. diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index adf5ea5f2..d3b85e11b 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -45,7 +45,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal dataAddr := args[1].Pointer() var hdr linux.CapUserHeader - if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return 0, nil, err } // hdr.Pid doesn't need to be valid if this capget() is a "version probe" @@ -65,7 +65,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal Permitted: uint32(p), Inheritable: uint32(i), } - _, err = t.CopyOut(dataAddr, &data) + _, err = data.CopyOut(t, dataAddr) return 0, nil, err case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: @@ -88,12 +88,12 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal Inheritable: uint32(i >> 32), }, } - _, err = t.CopyOut(dataAddr, &data) + _, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:]) return 0, nil, err default: hdr.Version = linux.HighestCapabilityVersion - if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } if dataAddr != 0 { @@ -109,7 +109,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal dataAddr := args[1].Pointer() var hdr linux.CapUserHeader - if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return 0, nil, err } switch hdr.Version { @@ -118,7 +118,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.EPERM } var data linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := data.CopyIn(t, dataAddr); err != nil { return 0, nil, err } p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities @@ -131,7 +131,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.EPERM } var data [2]linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { return 0, nil, err } p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities @@ -141,7 +141,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal default: hdr.Version = linux.HighestCapabilityVersion - if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, 
nil, err } return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 256422689..519066a47 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" @@ -83,6 +84,7 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro } rel = f.Dirent if !fs.IsDir(rel.Inode.StableAttr) { + f.DecRef(t) return syserror.ENOTDIR } } @@ -601,19 +603,19 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shared flags between file and socket. switch request { case linux.FIONCLEX: - t.FDTable().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: - t.FDTable().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil case linux.FIONBIO: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.Flags() @@ -627,7 +629,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FIOASYNC: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.Flags() @@ -641,15 +643,14 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FIOSETOWN, linux.SIOCSPGRP: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } fSetOwn(t, file, set) return 0, nil, nil case linux.FIOGETOWN, linux.SIOCGPGRP: - who := fGetOwn(t, file) - _, err := t.CopyOut(args[2].Pointer(), &who) + _, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file)) return 0, nil, err default: @@ -694,7 +695,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Top it off with a terminator. - _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00")) + _, err = t.CopyOutBytes(addr+usermem.Addr(bytes), []byte("\x00")) return uintptr(bytes + 1), nil, err } @@ -787,7 +788,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). - file, _ := t.FDTable().Remove(fd) + file, _ := t.FDTable().Remove(t, fd) if file == nil { return 0, nil, syserror.EBADF } @@ -941,7 +942,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - err := t.FDTable().SetFlags(fd, kernel.FDFlags{ + err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) return 0, nil, err @@ -962,7 +963,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy in the lock request. 
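// Note on the pattern below: flock.CopyIn is a generated marshalling method
// on linux.Flock rather than the old reflection-based t.CopyIn; the
// F_SETLK/F_GETLK semantics and struct layout are unchanged, only the copy
// mechanism differs.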
flockAddr := args[2].Pointer() var flock linux.Flock - if _, err := t.CopyIn(flockAddr, &flock); err != nil { + if _, err := flock.CopyIn(t, flockAddr); err != nil { return 0, nil, err } @@ -1052,12 +1053,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_GETOWN_EX: addr := args[2].Pointer() owner := fGetOwnEx(t, file) - _, err := t.CopyOut(addr, &owner) + _, err := owner.CopyOut(t, addr) return 0, nil, err case linux.F_SETOWN_EX: addr := args[2].Pointer() var owner linux.FOwnerEx - _, err := t.CopyIn(addr, &owner) + _, err := owner.CopyIn(t, addr) if err != nil { return 0, nil, err } @@ -1154,6 +1155,10 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.ThenChange(vfs2/fd.go) + +// LINT.IfChange + func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1918,7 +1923,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times linux.Utime - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := times.CopyIn(t, timesAddr); err != nil { return 0, nil, err } ts = fs.TimeSpec{ @@ -1938,7 +1943,7 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return 0, nil, err } ts = fs.TimeSpec{ @@ -1966,7 +1971,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timespec - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { return 0, nil, err } if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { @@ -2000,7 +2005,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return 0, nil, err } if times[0].Usec >= 1e6 || times[0].Usec < 0 || diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 12b2fa690..f39ce0639 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -306,8 +306,8 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // Despite the syscall using the name 'pid' for this variable, it is // very much a tid. tid := args[0].Int() - head := args[1].Pointer() - size := args[2].Pointer() + headAddr := args[1].Pointer() + sizeAddr := args[2].Pointer() if tid < 0 { return 0, nil, syserror.EINVAL @@ -321,12 +321,16 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel } // Copy out head pointer. - if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil { + head := t.Arch().Native(uintptr(ot.GetRobustList())) + if _, err := head.CopyOut(t, headAddr); err != nil { return 0, nil, err } - // Copy out size, which is a constant. - if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil { + // Copy out size, which is a constant. 
Note that while size isn't + // an address, it is defined as the arch-dependent size_t, so it + // needs to be converted to a native-sized int. + size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead)) + if _, err := size.CopyOut(t, sizeAddr); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 59004cefe..b25f7d881 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -19,7 +19,6 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -93,19 +92,23 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir } } -// oldDirentHdr is a fixed sized header matching the fixed size -// fields found in the old linux dirent struct. +// oldDirentHdr is a fixed-size header matching the fixed-size fields found in +// the old linux dirent struct. +// +// +marshal type oldDirentHdr struct { Ino uint64 Off uint64 - Reclen uint16 + Reclen uint16 `marshal:"unaligned"` // Struct ends mid-word. } -// direntHdr is a fixed sized header matching the fixed size -// fields found in the new linux dirent struct. +// direntHdr is a fixed-size header matching the fixed-size fields found in the +// new linux dirent struct. +// +// +marshal type direntHdr struct { OldHdr oldDirentHdr - Typ uint8 + Typ uint8 `marshal:"unaligned"` // Struct ends mid-word. } // dirent contains the data pointed to by a new linux dirent struct. @@ -134,20 +137,20 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent // the old linux dirent format. func smallestDirent(a arch.Context) uint { d := dirent{} - return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1 + return uint(d.Hdr.OldHdr.SizeBytes()) + a.Width() + 1 } // smallestDirent64 returns the size of the smallest possible dirent using // the new linux dirent format. func smallestDirent64(a arch.Context) uint { d := dirent{} - return uint(binary.Size(d.Hdr)) + a.Width() + return uint(d.Hdr.SizeBytes()) + a.Width() } // padRec pads the name field until the rec length is a multiple of the width, // which must be a power of 2. It returns the padded rec length. func (d *dirent) padRec(width int) uint16 { - a := int(binary.Size(d.Hdr)) + len(d.Name) + a := d.Hdr.SizeBytes() + len(d.Name) r := (a + width) &^ (width - 1) padding := r - a d.Name = append(d.Name, make([]byte, padding)...) @@ -157,7 +160,7 @@ func (d *dirent) padRec(width int) uint16 { // Serialize64 serializes a Dirent struct to a byte slice, keeping the new // linux dirent format. Returns the number of bytes serialized or an error. func (d *dirent) Serialize64(w io.Writer) (int, error) { - n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr)) + n1, err := d.Hdr.WriteTo(w) if err != nil { return 0, err } @@ -165,14 +168,14 @@ func (d *dirent) Serialize64(w io.Writer) (int, error) { if err != nil { return 0, err } - return n1 + n2, nil + return int(n1) + n2, nil } // Serialize serializes a Dirent struct to a byte slice, using the old linux // dirent format. // Returns the number of bytes serialized or an error. 
func (d *dirent) Serialize(w io.Writer) (int, error) { - n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr)) + n1, err := d.Hdr.OldHdr.WriteTo(w) if err != nil { return 0, err } @@ -184,7 +187,7 @@ func (d *dirent) Serialize(w io.Writer) (int, error) { if err != nil { return 0, err } - return n1 + n2 + n3, nil + return int(n1) + n2 + n3, nil } // direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index 715ac45e6..a29d307e5 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -49,13 +49,13 @@ func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() - if _, err := t.CopyOut(ruidAddr, ruid); err != nil { + if _, err := ruid.CopyOut(t, ruidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(euidAddr, euid); err != nil { + if _, err := euid.CopyOut(t, euidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(suidAddr, suid); err != nil { + if _, err := suid.CopyOut(t, suidAddr); err != nil { return 0, nil, err } return 0, nil, nil @@ -84,13 +84,13 @@ func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() - if _, err := t.CopyOut(rgidAddr, rgid); err != nil { + if _, err := rgid.CopyOut(t, rgidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(egidAddr, egid); err != nil { + if _, err := egid.CopyOut(t, egidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(sgidAddr, sgid); err != nil { + if _, err := sgid.CopyOut(t, sgidAddr); err != nil { return 0, nil, err } return 0, nil, nil @@ -157,7 +157,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys for i, kgid := range kgids { gids[i] = kgid.In(t.UserNamespace()).OrOverflow() } - if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil { + if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return uintptr(len(gids)), nil, nil @@ -173,7 +173,7 @@ func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, t.SetExtraGIDs(nil) } gids := make([]auth.GID, size) - if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil { + if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return 0, nil, t.SetExtraGIDs(gids) diff --git a/pkg/sentry/syscalls/linux/sys_membarrier.go b/pkg/sentry/syscalls/linux/sys_membarrier.go new file mode 100644 index 000000000..63ee5d435 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_membarrier.go @@ -0,0 +1,103 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Membarrier implements syscall membarrier(2). +func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + cmd := args[0].Int() + flags := args[1].Uint() + + switch cmd { + case linux.MEMBARRIER_CMD_QUERY: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + var supportedCommands uintptr + if t.Kernel().Platform.HaveGlobalMemoryBarrier() { + supportedCommands |= linux.MEMBARRIER_CMD_GLOBAL | + linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED | + linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | + linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | + linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED + } + if t.RSeqAvailable() { + supportedCommands |= linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | + linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ + } + return supportedCommands, nil, nil + case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { + return 0, nil, syserror.EINVAL + } + if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() { + return 0, nil, syserror.EPERM + } + return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier() + case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { + return 0, nil, syserror.EINVAL + } + // no-op + return 0, nil, nil + case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { + return 0, nil, syserror.EINVAL + } + t.MemoryManager().EnableMembarrierPrivate() + return 0, nil, nil + case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 { + return 0, nil, syserror.EINVAL + } + if !t.RSeqAvailable() { + return 0, nil, syserror.EINVAL + } + if !t.MemoryManager().IsMembarrierRSeqEnabled() { + return 0, nil, syserror.EPERM + } + // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have + // the ability to preempt specific CPUs. + return 0, nil, t.Kernel().Platform.PreemptAllCPUs() + case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.RSeqAvailable() { + return 0, nil, syserror.EINVAL + } + t.MemoryManager().EnableMembarrierRSeq() + return 0, nil, nil + default: + // Probably a command we don't implement. + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index d0109baa4..cd8dfdfa4 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } + } else if shared { + // Back shared anonymous mappings with a special mappable. 
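// Illustrative sketch, not part of the change itself: the branch below
// services MAP_ANONYMOUS|MAP_SHARED requests, i.e. a userspace call such as
//
//	data, err := unix.Mmap(-1, 0, 4096,
//		unix.PROT_READ|unix.PROT_WRITE,
//		unix.MAP_ANONYMOUS|unix.MAP_SHARED)
//
// (unix is golang.org/x/sys/unix; fd is -1 for anonymous mappings.) Unlike
// private anonymous memory, these pages must remain shared across fork(), so
// they need a real backing Mappable rather than demand-zero memory.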
+ opts.Offset = 0 + m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel()) + if err != nil { + return 0, nil, err + } + opts.MappingIdentity = m // transfers ownership of m to opts + opts.Mappable = m } rv, err := t.MemoryManager().MMap(t, opts) @@ -239,7 +248,7 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, syserror.ENOMEM } resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) - _, err := t.CopyOut(vec, resident) + _, err := t.CopyOutBytes(vec, resident) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 3149e4aad..849a47476 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -46,9 +47,9 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { return 0, err } - if _, err := t.CopyOut(addr, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { - if file, _ := t.FDTable().Remove(fd); file != nil { + if file, _ := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 789e2ed5b..254f4c9f9 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -162,7 +162,7 @@ func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD pfd := make([]linux.PollFD, nfds) if nfds > 0 { - if _, err := t.CopyIn(addr, &pfd); err != nil { + if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { return nil, err } } @@ -189,7 +189,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. if nfds > 0 && err == nil { - if _, err := t.CopyOut(addr, pfd); err != nil { + if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { return remainingTimeout, 0, err } } @@ -202,7 +202,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy set := make([]byte, nBytes) if addr != 0 { - if _, err := t.CopyIn(addr, &set); err != nil { + if _, err := t.CopyInBytes(addr, set); err != nil { return nil, err } // If we only use part of the last byte, mask out the extraneous bits. @@ -329,19 +329,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // Copy updated vectors back. 
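// Note: r, w, and e below are raw []byte fd_set bitmaps, so the
// reflection-based t.CopyOut is swapped for t.CopyOutBytes, which copies the
// slice contents directly; the same substitution recurs throughout this
// change wherever the value being copied is already a byte slice.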
if readFDs != 0 { - if _, err := t.CopyOut(readFDs, r); err != nil { + if _, err := t.CopyOutBytes(readFDs, r); err != nil { return 0, err } } if writeFDs != 0 { - if _, err := t.CopyOut(writeFDs, w); err != nil { + if _, err := t.CopyOutBytes(writeFDs, w); err != nil { return 0, err } } if exceptFDs != 0 { - if _, err := t.CopyOut(exceptFDs, e); err != nil { + if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { return 0, err } } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 64a725296..a892d2c62 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -43,7 +44,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, nil case linux.PR_GET_PDEATHSIG: - _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) + _, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal())) return 0, nil, err case linux.PR_GET_DUMPABLE: @@ -110,7 +111,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall buf[len] = 0 len++ } - _, err := t.CopyOut(addr, buf[:len]) + _, err := t.CopyOutBytes(addr, buf[:len]) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index d5d5b6959..309c183a3 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -26,17 +27,13 @@ import ( // rlimit describes an implementation of 'struct rlimit', which may vary from // system-to-system. type rlimit interface { + marshal.Marshallable + // toLimit converts an rlimit to a limits.Limit. toLimit() *limits.Limit // fromLimit converts a limits.Limit to an rlimit. fromLimit(lim limits.Limit) - - // copyIn copies an rlimit from the untrusted app to the kernel. - copyIn(t *kernel.Task, addr usermem.Addr) error - - // copyOut copies an rlimit from the kernel to the untrusted app. - copyOut(t *kernel.Task, addr usermem.Addr) error } // newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. @@ -50,6 +47,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) { } } +// +marshal type rlimit64 struct { Cur uint64 Max uint64 @@ -70,12 +68,12 @@ func (r *rlimit64) fromLimit(lim limits.Limit) { } func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error { - _, err := t.CopyIn(addr, r) + _, err := r.CopyIn(t, addr) return err } func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error { - _, err := t.CopyOut(addr, *r) + _, err := r.CopyOut(t, addr) return err } @@ -140,7 +138,8 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } rlim.fromLimit(lim) - return 0, nil, rlim.copyOut(t, addr) + _, err = rlim.CopyOut(t, addr) + return 0, nil, err } // Setrlimit implements linux syscall setrlimit(2). 
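A minimal sketch of what embedding marshal.Marshallable in the rlimit interface buys, with a hypothetical helper name and assuming the same imports as the file above: every implementation now carries generated CopyIn/CopyOut/SizeBytes methods, so callers can marshal through the interface without the hand-written copyIn/copyOut pair being deleted here.

	// copyOutRlimit writes any rlimit implementation back to user memory
	// using its generated marshalling method.
	func copyOutRlimit(t *kernel.Task, addr usermem.Addr, r rlimit) error {
		_, err := r.CopyOut(t, addr) // provided via the embedded marshal.Marshallable
		return err
	}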
@@ -155,7 +154,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if err != nil { return 0, nil, err } - if err := rlim.copyIn(t, addr); err != nil { + if _, err := rlim.CopyIn(t, addr); err != nil { return 0, nil, syserror.EFAULT } _, err = prlimit64(t, resource, rlim.toLimit()) diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index 1674c7445..ac5c98a54 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -80,7 +80,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } ru := getrusage(t, which) - _, err := t.CopyOut(addr, &ru) + _, err := ru.CopyOut(t, addr) return 0, nil, err } @@ -104,7 +104,7 @@ func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall CUTime: linux.ClockTFromDuration(cs2.UserTime), CSTime: linux.ClockTFromDuration(cs2.SysTime), } - if _, err := t.CopyOut(addr, &r); err != nil { + if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index 99f6993f5..bfcf44b6f 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -27,8 +27,10 @@ const ( ) // SchedParam replicates struct sched_param in sched.h. +// +// +marshal type SchedParam struct { - schedPriority int64 + schedPriority int32 } // SchedGetparam implements linux syscall sched_getparam(2). @@ -45,7 +47,7 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.ESRCH } r := SchedParam{schedPriority: onlyPriority} - if _, err := t.CopyOut(param, r); err != nil { + if _, err := r.CopyOut(t, param); err != nil { return 0, nil, err } @@ -79,7 +81,7 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke return 0, nil, syserror.ESRCH } var r SchedParam - if _, err := t.CopyIn(param, &r); err != nil { + if _, err := r.CopyIn(t, param); err != nil { return 0, nil, syserror.EINVAL } if r.schedPriority != onlyPriority { diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 5b7a66f4d..4fdb4463c 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -24,6 +24,8 @@ import ( ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +// +// +marshal type userSockFprog struct { // Len is the length of the filter in BPF instructions. Len uint16 @@ -33,7 +35,7 @@ type userSockFprog struct { // Filter is a user pointer to the struct sock_filter array that makes up // the filter program. Filter is a uint64 rather than a usermem.Addr // because usermem.Addr is actually uintptr, which is not a fixed-size - // type, and encoding/binary.Read objects to this. + // type. 
Filter uint64 } @@ -54,11 +56,11 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { } var fprog userSockFprog - if _, err := t.CopyIn(addr, &fprog); err != nil { + if _, err := fprog.CopyIn(t, addr); err != nil { return err } filter := make([]linux.BPFInstruction, int(fprog.Len)) - if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + if _, err := linux.CopyBPFInstructionSliceIn(t, usermem.Addr(fprog.Filter), filter); err != nil { return err } compiledFilter, err := bpf.Compile(filter) diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 5f54f2456..47dadb800 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -66,7 +67,7 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } ops := make([]linux.Sembuf, nsops) - if _, err := t.CopyIn(sembufAddr, ops); err != nil { + if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil { return 0, nil, err } @@ -116,8 +117,8 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.IPC_SET: arg := args[3].Pointer() - s := linux.SemidDS{} - if _, err := t.CopyIn(arg, &s); err != nil { + var s linux.SemidDS + if _, err := s.CopyIn(t, arg); err != nil { return 0, nil, err } @@ -188,7 +189,7 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { return syserror.EINVAL } vals := make([]uint16, set.Size()) - if _, err := t.CopyIn(array, vals); err != nil { + if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil { return err } creds := auth.CredentialsFromContext(t) @@ -217,7 +218,7 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error { if err != nil { return err } - _, err = t.CopyOut(array, vals) + _, err = primitive.CopyUint16SliceOut(t, array, vals) return err } diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index f0ae8fa8e..584064143 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -112,18 +112,18 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal stat, err := segment.IPCStat(t) if err == nil { - _, err = t.CopyOut(buf, stat) + _, err = stat.CopyOut(t, buf) } return 0, nil, err case linux.IPC_INFO: params := r.IPCInfo() - _, err := t.CopyOut(buf, params) + _, err := params.CopyOut(t, buf) return 0, nil, err case linux.SHM_INFO: info := r.ShmInfo() - _, err := t.CopyOut(buf, info) + _, err := info.CopyOut(t, buf) return 0, nil, err } @@ -137,11 +137,10 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch cmd { case linux.IPC_SET: var ds linux.ShmidDS - _, err = t.CopyIn(buf, &ds) - if err != nil { + if _, err = ds.CopyIn(t, buf); err != nil { return 0, nil, err } - err = segment.Set(t, &ds) + err := segment.Set(t, &ds) return 0, nil, err case linux.IPC_RMID: diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 38f573c14..9cd052c3d 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" + 
"gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -29,8 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // LINT.IfChange @@ -67,10 +67,10 @@ const flagsOffset = 48 const sizeOfInt32 = 4 // messageHeader64Len is the length of a MessageHeader64 struct. -var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) +var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. -var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) +var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) // baseRecvFlags are the flags that are accepted across recvmsg(2), // recvmmsg(2), and recvfrom(2). @@ -78,6 +78,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. +// +// +marshal type MessageHeader64 struct { // Name is the optional pointer to a network address buffer. Name uint64 @@ -106,30 +108,14 @@ type MessageHeader64 struct { // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in // the recvmmsg and sendmmsg syscalls. +// +// +marshal type multipleMessageHeader64 struct { msgHdr MessageHeader64 msgLen uint32 _ int32 } -// CopyInMessageHeader64 copies a message header from user to kernel memory. -func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { - b := t.CopyScratchBuffer(52) - if _, err := t.CopyInBytes(addr, b); err != nil { - return err - } - - msg.Name = usermem.ByteOrder.Uint64(b[0:]) - msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) - msg.Iov = usermem.ByteOrder.Uint64(b[16:]) - msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) - msg.Control = usermem.ByteOrder.Uint64(b[32:]) - msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) - msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) - - return nil -} - // CaptureAddress allocates memory for and copies a socket address structure // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { @@ -148,10 +134,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, // writeAddress writes a sockaddr structure and its length to an output buffer // in the unstrusted address space range. If the address is bigger than the // buffer, it is truncated. -func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { +func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { // Get the buffer length. var bufLen uint32 - if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { return err } @@ -160,7 +146,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Write the length unconditionally. 
- if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { return err } @@ -173,7 +159,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Copy as much of the address as will fit in the buffer. - encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) + addr.MarshalUnsafe(encodedAddr) if bufLen > uint32(len(encodedAddr)) { bufLen = uint32(len(encodedAddr)) } @@ -247,9 +234,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Copy the file descriptors out. - if _, err := t.CopyOut(socks, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, socks, fds); err != nil { for _, fd := range fds { - if file, _ := t.FDTable().Remove(fd); file != nil { + if file, _ := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } @@ -456,8 +443,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Read the length. Reject negative values. - optLen := int32(0) - if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + var optLen int32 + if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { return 0, nil, err } if optLen < 0 { @@ -471,7 +458,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } vLen := int32(binary.Size(v)) - if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil { return 0, nil, err } @@ -733,7 +720,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -748,7 +735,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { // Capture the message header and io vectors. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -780,7 +767,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if int(msg.Flags) != mflags { // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } @@ -817,17 +804,17 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } // Copy the control data to the caller. - if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { return 0, err } if len(controlData) > 0 { - if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } // Copy out the flags to the caller. 
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } @@ -996,7 +983,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -1011,7 +998,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) { // Capture the message header. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -1022,7 +1009,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) - if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } @@ -1065,7 +1052,9 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) - if err != nil { + // Control messages should be released on error as well as for zero-length + // messages, which are discarded by the receiver. + if n == 0 || err != nil { controlMessages.Release(t) } return uintptr(n), err diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index c69941feb..46616c961 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -141,7 +142,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy in the offset. var offset int64 - if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + if _, err := primitive.CopyInt64In(t, offsetAddr, &offset); err != nil { return 0, nil, err } @@ -149,11 +150,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ Length: count, SrcOffset: true, - SrcStart: offset, + SrcStart: int64(offset), }, outFile.Flags().NonBlocking) // Copy out the new offset. 
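// Note: per sendfile(2), a non-NULL offset pointer means the input file's
// own position is left untouched and the caller's offset variable is
// advanced by the number of bytes transferred, which is why offset+n is
// written back below rather than an update of inFile's offset.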
- if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { + if _, err := primitive.CopyInt64Out(t, offsetAddr, offset+n); err != nil { return 0, nil, err } } else { @@ -228,7 +229,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } var offset int64 - if _, err := t.CopyIn(outOffset, &offset); err != nil { + if _, err := primitive.CopyInt64In(t, outOffset, &offset); err != nil { return 0, nil, err } @@ -246,7 +247,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } var offset int64 - if _, err := t.CopyIn(inOffset, &offset); err != nil { + if _, err := primitive.CopyInt64In(t, inOffset, &offset); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index a5826f2dd..cda29a8b5 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -221,7 +221,7 @@ func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr DevMajor: uint32(devMajor), DevMinor: devMinor, } - _, err := t.CopyOut(statxAddr, &s) + _, err := s.CopyOut(t, statxAddr) return err } @@ -283,7 +283,7 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { FragmentSize: d.Inode.StableAttr.BlockSize, // Leave other fields 0 like simple_statfs does. } - _, err = t.CopyOut(addr, &statfs) + _, err = statfs.CopyOut(t, addr) return err } diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 297de052a..db3d924d9 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -21,13 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" ) -// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo. +// Sysinfo implements Linux syscall sysinfo(2). func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mf := t.Kernel().MemoryFile() - mf.UpdateUsage() - _, totalUsage := usage.MemoryAccounting.Copy() + mfUsage, err := mf.TotalUsage() + if err != nil { + return 0, nil, err + } + memStats, _ := usage.MemoryAccounting.Copy() + totalUsage := mfUsage + memStats.Mapped totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) memFree := totalSize - totalUsage if memFree > totalSize { @@ -37,12 +41,12 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Only a subset of the fields in sysinfo_t make sense to return. 
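// Restating the accounting above: mf.TotalUsage() reports bytes actually
// backed in the MemoryFile, and memStats.Mapped adds host mmap usage, so
//
//	totalUsage := mfUsage + memStats.Mapped
//	memFree := totalSize - totalUsage // both uint64
//
// The memFree > totalSize comparison catches the unsigned wraparound that
// would occur if usage ever exceeded the configured total, clamping FreeRAM
// rather than reporting a huge value.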
si := linux.Sysinfo{ - Procs: uint16(len(t.PIDNamespace().Tasks())), + Procs: uint16(t.Kernel().TaskSet().Root.NumTasks()), Uptime: t.Kernel().MonotonicClock().Now().Seconds(), TotalRAM: totalSize, FreeRAM: memFree, Unit: 1, } - _, err := t.CopyOut(addr, si) + _, err = si.CopyOut(t, addr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 101096038..39ca9ea97 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -311,13 +312,13 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage return 0, err } if statusAddr != 0 { - if _, err := t.CopyOut(statusAddr, wr.Status); err != nil { + if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil { return 0, err } } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) - if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, err } } @@ -395,14 +396,14 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // as well. if infop != 0 { var si arch.SignalInfo - _, err = t.CopyOut(infop, &si) + _, err = si.CopyOut(t, infop) } } return 0, nil, err } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) - if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, nil, err } } @@ -441,7 +442,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal default: t.Warningf("waitid got incomprehensible wait status %d", s) } - _, err = t.CopyOut(infop, &si) + _, err = si.CopyOut(t, infop) return 0, nil, err } @@ -558,9 +559,7 @@ func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // third argument to this system call is nowadays unused. if cpu != 0 { - buf := t.CopyScratchBuffer(4) - usermem.ByteOrder.PutUint32(buf, uint32(t.CPU())) - if _, err := t.CopyOutBytes(cpu, buf); err != nil { + if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil { return 0, nil, err } } diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index a2a24a027..c5054d2f1 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -19,6 +19,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -168,7 +169,7 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(r), nil, nil } - if _, err := t.CopyOut(addr, r); err != nil { + if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } return uintptr(r), nil, nil @@ -334,8 +335,8 @@ func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // Ask the time package for the timezone. _, offset := time.Now().Zone() // This int32 array mimics linux's struct timezone. 
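// Worked example: struct timezone is {tz_minuteswest, tz_dsttime}, and Go's
// time.Now().Zone() yields seconds east of UTC, hence the negation and
// division by 60 below. UTC+2 (offset 7200) becomes tz_minuteswest == -120,
// and tz_dsttime stays 0 since Linux no longer uses it.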
- timezone := [2]int32{-int32(offset) / 60, 0} - _, err := t.CopyOut(tz, timezone) + timezone := []int32{-int32(offset) / 60, 0} + _, err := primitive.CopyInt32SliceOut(t, tz, timezone) return 0, nil, err } return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index a4c400f87..45eef4feb 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -21,81 +21,63 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) const nsecPerSec = int64(time.Second) -// copyItimerValIn copies an ItimerVal from the untrusted app range to the -// kernel. The ItimerVal may be either 32 or 64 bits. -// A NULL address is allowed because because Linux allows -// setitimer(which, NULL, &old_value) which disables the timer. -// There is a KERN_WARN message saying this misfeature will be removed. -// However, that hasn't happened as of 3.19, so we continue to support it. -func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) { - if addr == usermem.Addr(0) { - return linux.ItimerVal{}, nil - } - - switch t.Arch().Width() { - case 8: - // Native size, just copy directly. - var itv linux.ItimerVal - if _, err := t.CopyIn(addr, &itv); err != nil { - return linux.ItimerVal{}, err - } - - return itv, nil - default: - return linux.ItimerVal{}, syserror.ENOSYS - } -} - -// copyItimerValOut copies an ItimerVal to the untrusted app range. -// The ItimerVal may be either 32 or 64 bits. -// A NULL address is allowed, in which case no copy takes place -func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error { - if addr == usermem.Addr(0) { - return nil - } - - switch t.Arch().Width() { - case 8: - // Native size, just copy directly. - _, err := t.CopyOut(addr, itv) - return err - default: - return syserror.ENOSYS - } -} - // Getitimer implements linux syscall getitimer(2). func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if t.Arch().Width() != 8 { + // Definition of linux.ItimerVal assumes 64-bit architecture. + return 0, nil, syserror.ENOSYS + } + timerID := args[0].Int() - val := args[1].Pointer() + addr := args[1].Pointer() olditv, err := t.Getitimer(timerID) if err != nil { return 0, nil, err } - return 0, nil, copyItimerValOut(t, val, &olditv) + // A NULL address is allowed, in which case no copy out takes place. + if addr == 0 { + return 0, nil, nil + } + _, err = olditv.CopyOut(t, addr) + return 0, nil, err } // Setitimer implements linux syscall setitimer(2). func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - timerID := args[0].Int() - newVal := args[1].Pointer() - oldVal := args[2].Pointer() + if t.Arch().Width() != 8 { + // Definition of linux.ItimerVal assumes 64-bit architecture. + return 0, nil, syserror.ENOSYS + } - newitv, err := copyItimerValIn(t, newVal) - if err != nil { - return 0, nil, err + timerID := args[0].Int() + newAddr := args[1].Pointer() + oldAddr := args[2].Pointer() + + var newitv linux.ItimerVal + // A NULL address is allowed because Linux allows + // setitimer(which, NULL, &old_value) which disables the timer. There is a + // KERN_WARN message saying this misfeature will be removed. However, that + // hasn't happened as of 3.19, so we continue to support it. 
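// The preserved misfeature, seen from userspace: a sketch using raw syscall
// plumbing (unix is golang.org/x/sys/unix, unsafe is needed for the
// pointer; 0 stands in for ITIMER_REAL):
//
//	var old unix.Itimerval
//	_, _, errno := unix.Syscall(unix.SYS_SETITIMER,
//		0,                             // which == ITIMER_REAL
//		0,                             // new_value == NULL disarms the timer
//		uintptr(unsafe.Pointer(&old))) // old_value still receives the setting
//	if errno != 0 { /* handle error */ }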
+ if newAddr != 0 { + if _, err := newitv.CopyIn(t, newAddr); err != nil { + return 0, nil, err + } } olditv, err := t.Setitimer(timerID, newitv) if err != nil { return 0, nil, err } - return 0, nil, copyItimerValOut(t, oldVal, &olditv) + // A NULL address is allowed, in which case no copy out takes place. + if oldAddr == 0 { + return 0, nil, nil + } + _, err = olditv.CopyOut(t, oldAddr) + return 0, nil, err } // Alarm implements linux syscall alarm(2). @@ -131,7 +113,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S var sev *linux.Sigevent if sevp != 0 { sev = &linux.Sigevent{} - if _, err = t.CopyIn(sevp, sev); err != nil { + if _, err = sev.CopyIn(t, sevp); err != nil { return 0, nil, err } } @@ -141,7 +123,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - if _, err := t.CopyOut(timerIDp, &id); err != nil { + if _, err := id.CopyOut(t, timerIDp); err != nil { t.IntervalTimerDelete(id) return 0, nil, err } @@ -157,7 +139,7 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. oldValAddr := args[3].Pointer() var newVal linux.Itimerspec - if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0) @@ -165,9 +147,8 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. return 0, nil, err } if oldValAddr != 0 { - if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { - return 0, nil, err - } + _, err = oldVal.CopyOut(t, oldValAddr) + return 0, nil, err } return 0, nil, nil } @@ -181,7 +162,7 @@ func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
if err != nil { return 0, nil, err } - _, err = t.CopyOut(curValAddr, &curVal) + _, err = curVal.CopyOut(t, curValAddr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index 34b03e4ee..cadd9d348 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -81,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } var newVal linux.Itimerspec - if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock()) @@ -91,7 +91,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, oldS := tf.SetTime(newS) if oldValAddr != 0 { oldVal := ktime.ItimerspecFromSetting(tm, oldS) - if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + if _, err := oldVal.CopyOut(t, oldValAddr); err != nil { return 0, nil, err } } @@ -116,6 +116,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, s := tf.GetTime() curVal := ktime.ItimerspecFromSetting(tm, s) - _, err := t.CopyOut(curValAddr, &curVal) + _, err := curVal.CopyOut(t, curValAddr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index b3eb96a1c..6ddd30d5c 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -18,6 +18,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -30,17 +31,19 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.ARCH_GET_FS: addr := args[1].Pointer() fsbase := t.Arch().TLS() - _, err := t.CopyOut(addr, uint64(fsbase)) - if err != nil { - return 0, nil, err + switch t.Arch().Width() { + case 8: + if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil { + return 0, nil, err + } + default: + return 0, nil, syserror.ENOSYS } - case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { return 0, nil, syserror.EPERM } - case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index e9d702e8e..66c5974f5 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -46,7 +46,7 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy out the result. 
va := args[0].Pointer() - _, err := t.CopyOut(va, u) + _, err := u.CopyOut(t, va) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 64696b438..9ee766552 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -44,6 +44,9 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/gohacks", + "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/sentry/arch", "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", @@ -72,7 +75,5 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", - "//tools/go_marshal/primitive", ], ) diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index 42559bf69..6d0a38330 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -38,21 +39,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } for i := int32(0); i < nrEvents; i++ { - // Copy in the address. - cbAddrNative := t.Arch().Native(0) - if _, err := t.CopyIn(addr, cbAddrNative); err != nil { - if i > 0 { - // Some successful. - return uintptr(i), nil, nil + // Copy in the callback address. + var cbAddr usermem.Addr + switch t.Arch().Width() { + case 8: + var cbAddrP primitive.Uint64 + if _, err := cbAddrP.CopyIn(t, addr); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err } - // Nothing done. - return 0, nil, err + cbAddr = usermem.Addr(cbAddrP) + default: + return 0, nil, syserror.ENOSYS } // Copy in this callback. var cb linux.IOCallback - cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) - if _, err := t.CopyIn(cbAddr, &cb); err != nil { + if _, err := cb.CopyIn(t, cbAddr); err != nil { if i > 0 { // Some have been successful. return uintptr(i), nil, nil diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index c62f03509..d0cbb77eb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -24,7 +24,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -141,50 +140,26 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EINVAL } - // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent, - // maxEvents), so that the buffer can be allocated on the stack. + // Allocate space for a few events on the stack for the common case in + // which we don't have too many events. var ( - events [16]linux.EpollEvent - total int + eventsArr [16]linux.EpollEvent ch chan struct{} haveDeadline bool deadline ktime.Time ) for { - batchEvents := len(events) - if batchEvents > maxEvents { - batchEvents = maxEvents - } - n := ep.ReadEvents(events[:batchEvents]) - maxEvents -= n - if n != 0 { - // Copy what we read out. 
- copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n]) + events := ep.ReadEvents(eventsArr[:0], maxEvents) + if len(events) != 0 { + copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events) copiedEvents := copiedBytes / sizeofEpollEvent // rounded down - eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) - total += copiedEvents - if err != nil { - if total != 0 { - return uintptr(total), nil, nil - } - return 0, nil, err - } - // If we've filled the application's event buffer, we're done. - if maxEvents == 0 { - return uintptr(total), nil, nil - } - // Loop if we read a full batch, under the expectation that there - // may be more events to read. - if n == batchEvents { - continue + if copiedEvents != 0 { + return uintptr(copiedEvents), nil, nil } + return 0, nil, err } - // We get here if n != batchEvents. If we read any number of events - // (just now, or in a previous iteration of this loop), or if timeout - // is 0 (such that epoll_wait should be non-blocking), return the - // events we've read so far to the application. - if total != 0 || timeout == 0 { - return uintptr(total), nil, nil + if timeout == 0 { + return 0, nil, nil } // In the first iteration of this loop, register with the epoll // instance for readability events, but then immediately continue the @@ -207,8 +182,6 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if err == syserror.ETIMEDOUT { err = nil } - // total must be 0 since otherwise we would have returned - // above. return 0, nil, err } } diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index 066ee0863..c8ce2aabc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -110,8 +110,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } // Load the new TaskContext. - mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change - defer mntns.DecRef(t) + mntns := t.MountNamespaceVFS2() wd := t.FSContext().WorkingDirectoryVFS2() defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 4856554fe..36e89700e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -34,7 +34,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). 
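The EpollWait rewrite above switches ReadEvents to an append-style API: the caller passes eventsArr[:0], a zero-length slice backed by a 16-element stack array, so the common case of few ready events allocates nothing while larger result sets spill to the heap automatically. A standalone sketch of the idiom (event and readReady are illustrative stand-ins, not the sentry types):

package main

import "fmt"

type event struct{ data uint64 }

// readReady appends up to max ready events to events and returns the
// updated slice, mirroring the new ReadEvents signature.
func readReady(events []event, max int) []event {
	for i := 0; i < max && i < 3; i++ { // pretend three events are ready
		events = append(events, event{data: uint64(i)})
	}
	return events
}

func main() {
	var arr [16]event // stack buffer for the common case
	events := readReady(arr[:0], 8)
	fmt.Println(len(events), cap(events)) // 3 16: no heap growth needed
}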
- _, file := t.FDTable().Remove(fd) + _, file := t.FDTable().Remove(t, fd) if file == nil { return 0, nil, syserror.EBADF } @@ -137,7 +137,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) return 0, nil, err @@ -145,16 +145,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) - case linux.F_SETPIPE_SZ: - pipefile, ok := file.Impl().(*pipe.VFSPipeFD) - if !ok { - return 0, nil, syserror.EBADF - } - n, err := pipefile.SetPipeSize(int64(args[2].Int())) - if err != nil { - return 0, nil, err - } - return uintptr(n), nil, nil case linux.F_GETOWN: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { @@ -181,15 +171,25 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if !hasOwner { return 0, nil, nil } - _, err := t.CopyOut(args[2].Pointer(), &owner) + _, err := owner.CopyOut(t, args[2].Pointer()) return 0, nil, err case linux.F_SETOWN_EX: var owner linux.FOwnerEx - _, err := t.CopyIn(args[2].Pointer(), &owner) + _, err := owner.CopyIn(t, args[2].Pointer()) if err != nil { return 0, nil, err } return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID) + case linux.F_SETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + n, err := pipefile.SetPipeSize(int64(args[2].Int())) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -286,7 +286,7 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip // Copy in the lock request. flockAddr := args[2].Pointer() var flock linux.Flock - if _, err := t.CopyIn(flockAddr, &flock); err != nil { + if _, err := flock.CopyIn(t, flockAddr); err != nil { return err } diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 38778a388..2806c3f6f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -34,20 +35,20 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Handle ioctls that apply to all FDs. 
switch args[1].Int() { case linux.FIONCLEX: - t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: - t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil case linux.FIONBIO: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.StatusFlags() @@ -60,7 +61,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FIOASYNC: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.StatusFlags() @@ -82,12 +83,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall who = owner.PID } } - _, err := t.CopyOut(args[2].Pointer(), &who) + _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who) return 0, nil, err case linux.FIOSETOWN, linux.SIOCSPGRP: var who int32 - if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil { return 0, nil, err } ownerType := int32(linux.F_OWNER_PID) diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index dc05c2994..9d9dbf775 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" @@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } + } else if shared { + // Back shared anonymous mappings with an anonymous tmpfs file. + opts.Offset = 0 + file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length) + if err != nil { + return 0, nil, err + } + defer file.DecRef(t) + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } } rv, err := t.MemoryManager().MMap(t, opts) diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index 4bd5c7ca2..769c9b92f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -109,8 +109,8 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } defer target.Release(t) - - return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) + _, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) + return 0, nil, err } // Umount2 implements Linux syscall umount2(2). 
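The FIONBIO and FIOASYNC handlers above share one shape: copy in an int32 from the application, then set or clear a single bit in the file's status flags. A tiny self-contained sketch of that update (applyFIONBIO is an illustrative helper; 0x800 is the O_NONBLOCK value on amd64):

package main

import "fmt"

const oNonblock = 0x800 // O_NONBLOCK on amd64

// applyFIONBIO returns flags with O_NONBLOCK set when set is nonzero and
// cleared otherwise, matching the semantics of the ioctl handled above.
func applyFIONBIO(flags uint32, set int32) uint32 {
	if set != 0 {
		return flags | oNonblock
	}
	return flags &^ oNonblock
}

func main() {
	fmt.Printf("%#x\n", applyFIONBIO(0x0, 1))   // 0x800
	fmt.Printf("%#x\n", applyFIONBIO(0x802, 0)) // 0x2
}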
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index 9b4848d9e..ee38fdca0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -51,9 +52,9 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { if err != nil { return err } - if _, err := t.CopyOut(addr, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { - if _, file := t.FDTable().Remove(fd); file != nil { + if _, file := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index 79ad64039..c22e4ce54 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -165,7 +165,7 @@ func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD pfd := make([]linux.PollFD, nfds) if nfds > 0 { - if _, err := t.CopyIn(addr, &pfd); err != nil { + if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { return nil, err } } @@ -192,7 +192,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. if nfds > 0 && err == nil { - if _, err := t.CopyOut(addr, pfd); err != nil { + if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { return remainingTimeout, 0, err } } @@ -205,7 +205,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy set := make([]byte, nBytes) if addr != 0 { - if _, err := t.CopyIn(addr, &set); err != nil { + if _, err := t.CopyInBytes(addr, set); err != nil { return nil, err } // If we only use part of the last byte, mask out the extraneous bits. @@ -332,19 +332,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // Copy updated vectors back. if readFDs != 0 { - if _, err := t.CopyOut(readFDs, r); err != nil { + if _, err := t.CopyOutBytes(readFDs, r); err != nil { return 0, err } } if writeFDs != 0 { - if _, err := t.CopyOut(writeFDs, w); err != nil { + if _, err := t.CopyOutBytes(writeFDs, w); err != nil { return 0, err } } if exceptFDs != 0 { - if _, err := t.CopyOut(exceptFDs, e); err != nil { + if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { return 0, err } } @@ -497,6 +497,12 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return n, nil, err } +// +marshal +type sigSetWithSize struct { + sigsetAddr uint64 + sizeofSigset uint64 +} + // Pselect implements linux syscall pselect(2). func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nfds := int(args[0].Int()) // select(2) uses an int. @@ -538,12 +544,6 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return n, nil, err } -// +marshal -type sigSetWithSize struct { - sigsetAddr uint64 - sizeofSigset uint64 -} - // copyTimespecInToDuration copies a Timespec from the untrusted app range, // validates it and converts it to a Duration. 
// diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 5e6eb13ba..1ee37e5a8 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -346,7 +346,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opt return nil } var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, &times); err != nil { + if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { @@ -410,7 +410,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op return nil } var times [2]linux.Timespec - if _, err := t.CopyIn(timesAddr, &times); err != nil { + if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Nsec != linux.UTIME_OMIT { diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index a5032657a..7b33b3f59 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -30,8 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // minListenBacklog is the minimum reasonable backlog for listening sockets. @@ -66,10 +66,10 @@ const flagsOffset = 48 const sizeOfInt32 = 4 // messageHeader64Len is the length of a MessageHeader64 struct. -var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) +var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) // multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct. -var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) +var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) // baseRecvFlags are the flags that are accepted across recvmsg(2), // recvmmsg(2), and recvfrom(2). @@ -77,6 +77,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. +// +// +marshal type MessageHeader64 struct { // Name is the optional pointer to a network address buffer. Name uint64 @@ -105,30 +107,14 @@ type MessageHeader64 struct { // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in // the recvmmsg and sendmmsg syscalls. +// +// +marshal type multipleMessageHeader64 struct { msgHdr MessageHeader64 msgLen uint32 _ int32 } -// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { - b := t.CopyScratchBuffer(52) - if _, err := t.CopyInBytes(addr, b); err != nil { - return err - } - - msg.Name = usermem.ByteOrder.Uint64(b[0:]) - msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) - msg.Iov = usermem.ByteOrder.Uint64(b[16:]) - msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) - msg.Control = usermem.ByteOrder.Uint64(b[32:]) - msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) - msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) - - return nil -} - // CaptureAddress allocates memory for and copies a socket address structure // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { @@ -147,10 +133,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, // writeAddress writes a sockaddr structure and its length to an output buffer // in the untrusted address space range. If the address is bigger than the // buffer, it is truncated. -func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { +func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { // Get the buffer length. var bufLen uint32 - if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { return err } @@ -159,7 +145,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Write the length unconditionally. - if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { return err } @@ -172,7 +158,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Copy as much of the address as will fit in the buffer. - encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) + addr.MarshalUnsafe(encodedAddr) if bufLen > uint32(len(encodedAddr)) { bufLen = uint32(len(encodedAddr)) } @@ -250,9 +237,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } - if _, err := t.CopyOut(addr, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { - if _, file := t.FDTable().Remove(fd); file != nil { + if _, file := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } @@ -459,8 +446,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Read the length. Reject negative values.
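writeAddress above follows the usual sockaddr copy-out contract: the true address length is always written back, but the address bytes themselves are truncated to the caller's buffer. A short sketch of that contract with concrete numbers (truncateAddr is an illustrative helper, not sentry code):

package main

import "fmt"

// truncateAddr returns the bytes to copy out and the full length to report.
func truncateAddr(encoded []byte, bufLen uint32) ([]byte, uint32) {
	full := uint32(len(encoded))
	if bufLen > full {
		bufLen = full
	}
	return encoded[:bufLen], full
}

func main() {
	addr := make([]byte, 16) // e.g. a 16-byte sockaddr_in
	out, reported := truncateAddr(addr, 8)
	fmt.Println(len(out), reported) // 8 16: copy 8 bytes, report 16
}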
- optLen := int32(0) - if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + var optLen int32 + if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { return 0, nil, err } if optLen < 0 { @@ -474,7 +461,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } vLen := int32(binary.Size(v)) - if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil { return 0, nil, err } @@ -736,7 +723,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -751,7 +738,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { // Capture the message header and io vectors. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -783,7 +770,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla if int(msg.Flags) != mflags { // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } @@ -820,17 +807,17 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } // Copy the control data to the caller. - if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { return 0, err } if len(controlData) > 0 { - if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } @@ -999,7 +986,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -1014,7 +1001,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) { // Capture the message header. 
var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -1025,7 +1012,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) - if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } @@ -1068,7 +1055,9 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) - if err != nil { + // Control messages should be released on error as well as for zero-length + // messages, which are discarded by the receiver. + if n == 0 || err != nil { controlMessages.Release(t) } return uintptr(n), err diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 75bfa2c79..035e2a6b0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -18,9 +18,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -42,6 +45,9 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } + if count < 0 { + return 0, nil, syserror.EINVAL + } // Check for invalid flags. 
if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { @@ -88,7 +94,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inFile.Options().DenyPRead { return 0, nil, syserror.EINVAL } - if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil { + if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { return 0, nil, err } if inOffset < 0 { @@ -103,7 +109,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if outFile.Options().DenyPWrite { return 0, nil, syserror.EINVAL } - if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil { + if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { return 0, nil, err } if outOffset < 0 { @@ -131,21 +137,17 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case inIsPipe && outIsPipe: n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) case inIsPipe: + n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count) if outOffset != -1 { - n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{}) outOffset += n - } else { - n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{}) } case outIsPipe: + n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count) if inOffset != -1 { - n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{}) inOffset += n - } else { - n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) } default: - panic("not possible") + panic("at least one end of splice must be a pipe") } if n != 0 || err != syserror.ErrWouldBlock || nonBlock { @@ -158,25 +160,26 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Copy updated offsets out. if inOffsetPtr != 0 { - if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil { + if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil { return 0, nil, err } } if outOffsetPtr != 0 { - if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil { + if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil { return 0, nil, err } } - if n == 0 { - return 0, nil, err + if n != 0 { + // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some + // reason, events are generated for output files, but not input files. + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } - // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some - // reason, events are generated for output files, but not input files. - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - return uintptr(n), nil, nil + // We can only pass a single file to handleIOError, so pick outFile arbitrarily. + // This is used only for debugging purposes. + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) } // Tee implements Linux syscall tee(2). @@ -192,6 +195,9 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } + if count < 0 { + return 0, nil, syserror.EINVAL + } // Check for invalid flags.
if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { @@ -248,11 +254,20 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo break } } - if n == 0 { - return 0, nil, err + + if n != 0 { + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) + + // If a partial write is completed, the error is dropped. Log it here. + if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + log.Debugf("tee completed a partial write with error: %v", err) + err = nil + } } - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - return uintptr(n), nil, nil + + // We can only pass a single file to handleIOError, so pick inFile arbitrarily. + // This is used only for debugging purposes. + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile) } // Sendfile implements linux system call sendfile(2). @@ -301,9 +316,12 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile.Options().DenyPRead { return 0, nil, syserror.ESPIPE } - if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + var offsetP primitive.Int64 + if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { return 0, nil, err } + offset = int64(offsetP) + if offset < 0 { return 0, nil, syserror.EINVAL } @@ -341,16 +359,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outIsPipe { for n < count { var spliceN int64 + spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count) if offset != -1 { - spliceN, err = inFile.PRead(t, outPipeFD.IOSequence(count), offset, vfs.ReadOptions{}) offset += spliceN - } else { - spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) - } - if spliceN == 0 && err == io.EOF { - // We reached the end of the file. Eat the error and exit the loop. - err = nil - break } n += spliceN if err == syserror.ErrWouldBlock && !nonBlock { @@ -371,19 +382,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } else { readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } - if readN == 0 && err == io.EOF { - // We reached the end of the file. Eat the error and exit the loop. - err = nil - break - } n += readN - if err != nil { - break - } // Write all of the bytes that we read. This may need // multiple write calls to complete. - wbuf := buf[:n] + wbuf := buf[:readN] for len(wbuf) > 0 { var writeN int64 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) @@ -392,12 +395,21 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc err = dw.waitForOut(t) } if err != nil { - // We didn't complete the write. Only - // report the bytes that were actually - // written, and rewind the offset. + // We didn't complete the write. Only report the bytes that were actually + // written, and rewind offsets as needed. notWritten := int64(len(wbuf)) n -= notWritten - if offset != -1 { + if offset == -1 { + // We modified the offset of the input file itself during the read + // operation. Rewind it. + if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil { + // Log the error but don't return it, since the write has already + // completed successfully. 
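The rewind accounting in this Sendfile hunk (continued just below) is easiest to follow with concrete numbers: if a read advanced the count by readN bytes but notWritten of them never reached the output, both the byte count returned to the caller and the input offset must be rolled back. A worked sketch, with all values made up:

package main

import "fmt"

func main() {
	var n int64          // bytes reported to the caller so far
	offset := int64(100) // input file offset before this iteration

	readN := int64(64) // bytes read from the input file
	n += readN         // optimistic accounting after the read
	offset += readN    // offset is now 164

	notWritten := int64(24) // tail of the buffer that failed to write
	n -= notWritten         // report only what reached the output
	offset -= notWritten    // rewind so a retry rereads the unwritten tail

	fmt.Println(n, offset) // 40 140
}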
+ log.Warningf("failed to roll back input file offset: %v", seekErr) + } + } else { + // The sendfile call was provided an offset parameter that should be + // adjusted to reflect the number of bytes sent. Rewind it. offset -= notWritten } break @@ -414,18 +426,26 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if offsetAddr != 0 { // Copy out the new offset. - if _, err := t.CopyOut(offsetAddr, offset); err != nil { + offsetP := primitive.Uint64(offset) + if _, err := offsetP.CopyOut(t, offsetAddr); err != nil { return 0, nil, err } } - if n == 0 { - return 0, nil, err + if n != 0 { + inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) + + if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + // If a partial write is completed, the error is dropped. Log it here. + log.Debugf("sendfile completed a partial write with error: %v", err) + err = nil + } } - inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) - outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) - return uintptr(n), nil, nil + // We can only pass a single file to handleIOError, so pick inFile arbitrarily. + // This is used only for debugging purposes. + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) } // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 7a26890ef..250870c03 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -87,7 +87,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } var newVal linux.Itimerspec - if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock()) @@ -97,7 +97,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, oldS := tfd.SetTime(newS) if oldValAddr != 0 { oldVal := ktime.ItimerspecFromSetting(tm, oldS) - if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + if _, err := oldVal.CopyOut(t, oldValAddr); err != nil { return 0, nil, err } } @@ -122,6 +122,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, s := tfd.GetTime() curVal := ktime.ItimerspecFromSetting(tm, s) - _, err := t.CopyOut(curValAddr, &curVal) + _, err := curVal.CopyOut(t, curValAddr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index c576d9475..c50fd97eb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -93,16 +93,16 @@ func Override() { s.Table[165] = syscalls.Supported("mount", Mount) s.Table[166] = syscalls.Supported("umount2", Umount2) s.Table[187] = syscalls.Supported("readahead", Readahead) - s.Table[188] = syscalls.Supported("setxattr", Setxattr) + s.Table[188] = syscalls.Supported("setxattr", SetXattr) s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr) - s.Table[191] = syscalls.Supported("getxattr", Getxattr) + s.Table[191] = syscalls.Supported("getxattr", GetXattr) s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr) s.Table[193] = 
syscalls.Supported("fgetxattr", Fgetxattr) - s.Table[194] = syscalls.Supported("listxattr", Listxattr) + s.Table[194] = syscalls.Supported("listxattr", ListXattr) s.Table[195] = syscalls.Supported("llistxattr", Llistxattr) s.Table[196] = syscalls.Supported("flistxattr", Flistxattr) - s.Table[197] = syscalls.Supported("removexattr", Removexattr) + s.Table[197] = syscalls.Supported("removexattr", RemoveXattr) s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) @@ -163,16 +163,17 @@ func Override() { // Override ARM64. s = linux.ARM64 - s.Table[5] = syscalls.Supported("setxattr", Setxattr) + s.Table[2] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) + s.Table[5] = syscalls.Supported("setxattr", SetXattr) s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr) s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr) - s.Table[8] = syscalls.Supported("getxattr", Getxattr) + s.Table[8] = syscalls.Supported("getxattr", GetXattr) s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr) s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr) - s.Table[11] = syscalls.Supported("listxattr", Listxattr) + s.Table[11] = syscalls.Supported("listxattr", ListXattr) s.Table[12] = syscalls.Supported("llistxattr", Llistxattr) s.Table[13] = syscalls.Supported("flistxattr", Flistxattr) - s.Table[14] = syscalls.Supported("removexattr", Removexattr) + s.Table[14] = syscalls.Supported("removexattr", RemoveXattr) s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr) s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr) s.Table[17] = syscalls.Supported("getcwd", Getcwd) @@ -200,6 +201,7 @@ func Override() { s.Table[44] = syscalls.Supported("fstatfs", Fstatfs) s.Table[45] = syscalls.Supported("truncate", Truncate) s.Table[46] = syscalls.Supported("ftruncate", Ftruncate) + s.Table[47] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) s.Table[48] = syscalls.Supported("faccessat", Faccessat) s.Table[49] = syscalls.Supported("chdir", Chdir) s.Table[50] = syscalls.Supported("fchdir", Fchdir) @@ -221,12 +223,14 @@ func Override() { s.Table[68] = syscalls.Supported("pwrite64", Pwrite64) s.Table[69] = syscalls.Supported("preadv", Preadv) s.Table[70] = syscalls.Supported("pwritev", Pwritev) + s.Table[71] = syscalls.Supported("sendfile", Sendfile) s.Table[72] = syscalls.Supported("pselect", Pselect) s.Table[73] = syscalls.Supported("ppoll", Ppoll) s.Table[74] = syscalls.Supported("signalfd4", Signalfd4) s.Table[76] = syscalls.Supported("splice", Splice) s.Table[77] = syscalls.Supported("tee", Tee) s.Table[78] = syscalls.Supported("readlinkat", Readlinkat) + s.Table[79] = syscalls.Supported("newfstatat", Newfstatat) s.Table[80] = syscalls.Supported("fstat", Fstat) s.Table[81] = syscalls.Supported("sync", Sync) s.Table[82] = syscalls.Supported("fsync", Fsync) @@ -251,8 +255,10 @@ func Override() { s.Table[210] = syscalls.Supported("shutdown", Shutdown) s.Table[211] = syscalls.Supported("sendmsg", SendMsg) s.Table[212] = syscalls.Supported("recvmsg", RecvMsg) + s.Table[213] = syscalls.Supported("readahead", Readahead) s.Table[221] = syscalls.Supported("execve", Execve) s.Table[222] = 
syscalls.Supported("mmap", Mmap) + s.Table[223] = syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil) s.Table[242] = syscalls.Supported("accept4", Accept4) s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg) s.Table[267] = syscalls.Supported("syncfs", Syncfs) diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index ef99246ed..e05723ef9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -26,8 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// Listxattr implements Linux syscall listxattr(2). -func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// ListXattr implements Linux syscall listxattr(2). +func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return listxattr(t, args, followFinalSymlink) } @@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml } defer tpop.Release(t) - names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) + names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { return 0, nil, err } @@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } defer file.DecRef(t) - names, err := file.Listxattr(t, uint64(size)) + names, err := file.ListXattr(t, uint64(size)) if err != nil { return 0, nil, err } @@ -85,8 +85,8 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return uintptr(n), nil, nil } -// Getxattr implements Linux syscall getxattr(2). -func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// GetXattr implements Linux syscall getxattr(2). +func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getxattr(t, args, followFinalSymlink) } @@ -116,7 +116,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli return 0, nil, err } - value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{ + value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{ Name: name, Size: uint64(size), }) @@ -148,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)}) + value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)}) if err != nil { return 0, nil, err } @@ -159,8 +159,8 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return uintptr(n), nil, nil } -// Setxattr implements Linux syscall setxattr(2). -func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// SetXattr implements Linux syscall setxattr(2). 
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, setxattr(t, args, followFinalSymlink) } @@ -199,7 +199,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli return err } - return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{ + return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), @@ -233,15 +233,15 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{ + return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), }) } -// Removexattr implements Linux syscall removexattr(2). -func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// RemoveXattr implements Linux syscall removexattr(2). +func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, removexattr(t, args, followFinalSymlink) } @@ -269,7 +269,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy return err } - return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name) + return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name) } // Fremovexattr implements Linux syscall fremovexattr(2). @@ -288,7 +288,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. return 0, nil, err } - return 0, nil, file.Removexattr(t, name) + return 0, nil, file.RemoveXattr(t, name) } func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index ab1d140d2..5ed6726ab 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -278,7 +278,7 @@ func TotalMemory(memSize, used uint64) uint64 { } if memSize < used { memSize = used - // Bump totalSize to the next largest power of 2, if one exists, so + // Bump memSize to the next largest power of 2, if one exists, so // that MemFree isn't 0. 
if msb := bits.MostSignificantOne64(memSize); msb < 63 { memSize = uint64(1) << (uint(msb) + 1) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 642769e7c..c855608db 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -27,6 +27,39 @@ go_template_instance( }, ) +go_template_instance( + name = "file_description_refs", + out = "file_description_refs.go", + package = "vfs", + prefix = "FileDescription", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FileDescription", + }, +) + +go_template_instance( + name = "mount_namespace_refs", + out = "mount_namespace_refs.go", + package = "vfs", + prefix = "MountNamespace", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "MountNamespace", + }, +) + +go_template_instance( + name = "filesystem_refs", + out = "filesystem_refs.go", + package = "vfs", + prefix = "Filesystem", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Filesystem", + }, +) + go_library( name = "vfs", srcs = [ @@ -40,12 +73,15 @@ "event_list.go", "file_description.go", "file_description_impl_util.go", + "file_description_refs.go", "filesystem.go", "filesystem_impl_util.go", + "filesystem_refs.go", "filesystem_type.go", "inotify.go", "lock.go", "mount.go", + "mount_namespace_refs.go", "mount_unsafe.go", "options.go", "pathname.go", @@ -56,13 +92,13 @@ visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/fspath", "//pkg/gohacks", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 4b9faf2ea..5aad31b78 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -184,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's - File locking - `O_ASYNC` - -- Reference counts in the `vfs` package do not use the `refs` package since - `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference - count, resulting in considerable cache bloat. 24 bytes of this overhead is - for weak reference support, which have poor performance and will not be used - by VFS2. The remaining 40 bytes is to store a descriptive string and stack - trace for reference leak checking; we can support reference leak checking - without incurring this space overhead by including the applicable - information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 5a0e3e6b5..7ad0eaf86 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -52,6 +52,8 @@ const ( ) // anonFilesystemType implements FilesystemType. +// +// +stateify savable type anonFilesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. @@ -59,22 +61,28 @@ func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *au panic("cannot instantiate an anon filesystem") } -// Name implemenents FilesystemType.Name. +// Name implements FilesystemType.Name. func (anonFilesystemType) Name() string { return "none" } +// Release implements FilesystemType.Release. +func (anonFilesystemType) Release(ctx context.Context) {} + // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
// // Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl // methods that would require an anonDentry to be a directory return ENOTDIR. +// +// +stateify savable type anonFilesystem struct { vfsfs Filesystem devMinor uint32 } +// +stateify savable type anonDentry struct { vfsd Dentry @@ -245,32 +253,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements FilesystemImpl.ListxattrAt. -func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements FilesystemImpl.ListXattrAt. +func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { return nil, syserror.ENOTDIR } return nil, nil } -// GetxattrAt implements FilesystemImpl.GetxattrAt. -func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) { +// GetXattrAt implements FilesystemImpl.GetXattrAt. +func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() { return "", syserror.ENOTDIR } return "", syserror.ENOTSUP } -// SetxattrAt implements FilesystemImpl.SetxattrAt. -func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { +// SetXattrAt implements FilesystemImpl.SetXattrAt. +func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() { return syserror.ENOTDIR } return syserror.EPERM } -// RemovexattrAt implements FilesystemImpl.RemovexattrAt. -func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { +// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. +func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() { return syserror.ENOTDIR } diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index c9e724fef..97018651f 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace { return nil } +type mountNamespaceContext struct { + context.Context + mntns *MountNamespace +} + +// Value implements Context.Value. +func (mc mountNamespaceContext) Value(key interface{}) interface{} { + switch key { + case CtxMountNamespace: + mc.mntns.IncRef() + return mc.mntns + default: + return mc.Context.Value(key) + } +} + +// WithMountNamespace returns a copy of ctx with the given MountNamespace. +func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context { + return &mountNamespaceContext{ + Context: ctx, + mntns: mntns, + } +} + // RootFromContext returns the VFS root used by ctx. It takes a reference on // the returned VirtualDentry. If ctx does not have a specific VFS root, // RootFromContext returns a zero-value VirtualDentry. diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index bc7ea93ea..320ab7ce1 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -89,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl { // DentryImpl contains implementation details for a Dentry. Implementations of // DentryImpl should contain their associated Dentry by value as their first // field. +// +// +stateify savable type DentryImpl interface { // IncRef increments the Dentry's reference count. 
A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. @@ -242,8 +244,9 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // -// Preconditions: If to is not nil, it must be a child Dentry from the same -// Filesystem. from != to. +// Preconditions: +// * If to is not nil, it must be a child Dentry from the same Filesystem. +// * from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 1e9dffc8f..dde2ad79b 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -22,6 +22,8 @@ import ( ) // DeviceKind indicates whether a device is a block or character device. +// +// +stateify savable type DeviceKind uint32 const ( @@ -44,6 +46,7 @@ func (kind DeviceKind) String() string { } } +// +stateify savable type devTuple struct { kind DeviceKind major uint32 diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 1b5af9f73..8f36c3e3b 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -27,6 +27,8 @@ import ( var epollCycleMu sync.Mutex // EpollInstance represents an epoll instance, as described by epoll(7). +// +// +stateify savable type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl @@ -38,11 +40,11 @@ type EpollInstance struct { // interest is the set of file descriptors that are registered with the // EpollInstance for monitoring. interest is protected by interestMu. - interestMu sync.Mutex + interestMu sync.Mutex `state:"nosave"` interest map[epollInterestKey]*epollInterest // mu protects fields in registered epollInterests. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // ready is the set of file descriptors that may be "ready" for I/O. Note // that this must be an ordered list, not a map: "If more than maxevents @@ -55,6 +57,7 @@ type EpollInstance struct { ready epollInterestList } +// +stateify savable type epollInterestKey struct { // file is the registered FileDescription. No reference is held on file; // instead, when the last reference is dropped, FileDescription.DecRef() @@ -67,6 +70,8 @@ type epollInterestKey struct { } // epollInterest represents an EpollInstance's interest in a file descriptor. +// +// +stateify savable type epollInterest struct { // epoll is the owning EpollInstance. epoll is immutable. epoll *EpollInstance @@ -331,11 +336,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) { ep.mu.Unlock() } -// ReadEvents reads up to len(events) ready events into events and returns the -// number of events read. -// -// Preconditions: len(events) != 0. -func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { +// ReadEvents appends up to maxEvents events to events and returns the updated +// slice of events. +func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent { i := 0 // Hot path: avoid defer. ep.mu.Lock() @@ -368,16 +371,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { requeue.PushBack(epi) } // Report ievents.
- events[i] = linux.EpollEvent{ + events = append(events, linux.EpollEvent{ Events: ievents.ToLinux(), Data: epi.userData, - } + }) i++ - if i == len(events) { + if i == maxEvents { break } } ep.ready.PushBackList(&requeue) ep.mu.Unlock() - return i + return events } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index dcafffe57..183957ad8 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -37,13 +37,13 @@ import ( // FileDescription methods require that a reference is held. // // FileDescription is analogous to Linux's struct file. +// +// +stateify savable type FileDescription struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FileDescriptionRefs // flagsMu protects statusFlags and asyncHandler below. - flagsMu sync.Mutex + flagsMu sync.Mutex `state:"nosave"` // statusFlags contains status flags, "initialized by open(2) and possibly // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic @@ -58,7 +58,7 @@ type FileDescription struct { // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. - epollMu sync.Mutex + epollMu sync.Mutex `state:"nosave"` epolls map[*epollInterest]struct{} // vd is the filesystem location at which this FileDescription was opened. @@ -90,6 +90,8 @@ type FileDescription struct { } // FileDescriptionOptions contains options to FileDescription.Init(). +// +// +stateify savable type FileDescriptionOptions struct { // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool @@ -103,7 +105,7 @@ type FileDescriptionOptions struct { // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, - // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling // the corresponding FilesystemImpl methods instead of the corresponding // FileDescriptionImpl methods. // @@ -131,7 +133,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou } } - fd.refs = 1 + fd.EnableLeakCheck() // Remove "file creation flags" to mirror the behavior from file.f_flags in // fs/open.c:do_dentry_open. @@ -149,30 +151,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou return nil } -// IncRef increments fd's reference count. -func (fd *FileDescription) IncRef() { - atomic.AddInt64(&fd.refs, 1) -} - -// TryIncRef increments fd's reference count and returns true. If fd's -// reference count is already zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fd. -func (fd *FileDescription) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fd.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.FileDescriptionRefs.DecRef(func() { // Unregister fd from all epoll instances. 
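The hand-rolled IncRef/TryIncRef/Refs code removed above is taken over by the generated FileDescriptionRefs type (see the refs_template instantiations in the BUILD changes earlier). A minimal standalone sketch of the behavior that generated type preserves; the refs struct here is illustrative, not the template output:

package main

import (
	"fmt"
	"sync/atomic"
)

type refs struct{ n int64 }

func (r *refs) IncRef() { atomic.AddInt64(&r.n, 1) }

// TryIncRef succeeds only while at least one reference is still held,
// using a CAS loop so it never resurrects a zero count.
func (r *refs) TryIncRef() bool {
	for {
		n := atomic.LoadInt64(&r.n)
		if n <= 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&r.n, n, n+1) {
			return true
		}
	}
}

// DecRef runs destroy when the final reference is dropped.
func (r *refs) DecRef(destroy func()) {
	switch n := atomic.AddInt64(&r.n, -1); {
	case n == 0:
		destroy()
	case n < 0:
		panic("DecRef called without a held reference")
	}
}

func main() {
	r := refs{n: 1}
	fmt.Println(r.TryIncRef()) // true; count is now 2
	r.DecRef(func() {})        // back to 1
	r.DecRef(func() { fmt.Println("destroyed") })
}

The template additionally wires in leak checking, which is why Init now calls fd.EnableLeakCheck() instead of setting fd.refs = 1.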
fd.epollMu.Lock() epolls := fd.epolls @@ -208,15 +189,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) { } fd.asyncHandler = nil fd.flagsMu.Unlock() - } else if refs < 0 { - panic("FileDescription.DecRef() called without holding a reference") - } -} - -// Refs returns the current number of references. The returned count -// is inherently racy and is unsafe to use without external synchronization. -func (fd *FileDescription) Refs() int64 { - return atomic.LoadInt64(&fd.refs) + }) } // Mount returns the mount on which fd was opened. It does not take a reference @@ -357,6 +330,9 @@ type FileDescriptionImpl interface { // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. // + // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on + // other files where it is not supported. + // // Preconditions: The FileDescription was opened for writing. Allocate(ctx context.Context, mode, offset, length uint64) error @@ -371,8 +347,9 @@ type FileDescriptionImpl interface { // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. // - // Preconditions: The FileDescription was opened for reading. - // FileDescriptionOptions.DenyPRead == false. + // Preconditions: + // * The FileDescription was opened for reading. + // * FileDescriptionOptions.DenyPRead == false. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -403,8 +380,9 @@ type FileDescriptionImpl interface { // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. // - // Preconditions: The FileDescription was opened for writing. - // FileDescriptionOptions.DenyPWrite == false. + // Preconditions: + // * The FileDescription was opened for writing. + // * FileDescriptionOptions.DenyPWrite == false. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -449,19 +427,19 @@ type FileDescriptionImpl interface { // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - // Listxattr returns all extended attribute names for the file. - Listxattr(ctx context.Context, size uint64) ([]string, error) + // ListXattr returns all extended attribute names for the file. + ListXattr(ctx context.Context, size uint64) ([]string, error) - // Getxattr returns the value associated with the given extended attribute + // GetXattr returns the value associated with the given extended attribute // for the file. - Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) + GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) - // Setxattr changes the value associated with the given extended attribute + // SetXattr changes the value associated with the given extended attribute // for the file. - Setxattr(ctx context.Context, opts SetxattrOptions) error + SetXattr(ctx context.Context, opts SetXattrOptions) error - // Removexattr removes the given extended attribute from the file. - Removexattr(ctx context.Context, name string) error + // RemoveXattr removes the given extended attribute from the file. + RemoveXattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. 
LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error @@ -477,6 +455,8 @@ type FileDescriptionImpl interface { } // Dirent holds the information contained in struct linux_dirent64. +// +// +stateify savable type Dirent struct { // Name is the filename. Name string @@ -664,25 +644,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. return fd.impl.Ioctl(ctx, uio, args) } -// Listxattr returns all extended attribute names for the file represented by +// ListXattr returns all extended attribute names for the file represented by // fd. // // If the size of the list (including a NUL terminating byte after every entry) // would exceed size, ERANGE may be returned (note that implementations // are free to ignore size entirely and return without error). In all cases, // if size is 0, the list should be returned without error, regardless of size. -func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { +func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) + names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size) vfsObj.putResolvingPath(ctx, rp) return names, err } - names, err := fd.impl.Listxattr(ctx, size) + names, err := fd.impl.ListXattr(ctx, size) if err == syserror.ENOTSUP { // Linux doesn't actually return ENOTSUP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security @@ -693,57 +673,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string return names, err } -// Getxattr returns the value associated with the given extended attribute for +// GetXattr returns the value associated with the given extended attribute for // the file represented by fd. // // If the size of the return value exceeds opts.Size, ERANGE may be returned // (note that implementations are free to ignore opts.Size entirely and return // without error). In all cases, if opts.Size is 0, the value should be // returned without error, regardless of size. -func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) { +func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts) vfsObj.putResolvingPath(ctx, rp) return val, err } - return fd.impl.Getxattr(ctx, *opts) + return fd.impl.GetXattr(ctx, *opts) } -// Setxattr changes the value associated with the given extended attribute for +// SetXattr changes the value associated with the given extended attribute for // the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error { +func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts) vfsObj.putResolvingPath(ctx, rp) return err } - return fd.impl.Setxattr(ctx, *opts) + return fd.impl.SetXattr(ctx, *opts) } -// Removexattr removes the given extended attribute from the file represented +// RemoveXattr removes the given extended attribute from the file represented // by fd. -func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { +func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) + err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name) vfsObj.putResolvingPath(ctx, rp) return err } - return fd.impl.Removexattr(ctx, name) + return fd.impl.RemoveXattr(ctx, name) } // SyncFS instructs the filesystem containing fd to execute the semantics of @@ -845,3 +825,45 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn } return fd.asyncHandler } + +// FileReadWriteSeeker is a helper struct to pass a FileDescription as +// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc. +type FileReadWriteSeeker struct { + FD *FileDescription + Ctx context.Context + ROpts ReadOptions + WOpts WriteOptions +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts) + return int(n), err +} + +// Read implements io.ReadWriteSeeker.Read. +func (f *FileReadWriteSeeker) Read(p []byte) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.Read(f.Ctx, dst, f.ROpts) + return int(n), err +} + +// Seek implements io.ReadWriteSeeker.Seek. +func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) { + return f.FD.Seek(f.Ctx, offset, int32(whence)) +} + +// WriteAt implements io.WriterAt.WriteAt. +func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts) + return int(n), err +} + +// Write implements io.ReadWriteSeeker.Write. +func (f *FileReadWriteSeeker) Write(p []byte) (int, error) { + buf := usermem.BytesIOSequence(p) + n, err := f.FD.Write(f.Ctx, buf, f.WOpts) + return int(n), err +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 6b8b4ad49..48ca9de44 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -42,6 +42,8 @@ import ( // FileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl // methods with default behavior analogous to Linux's. 
+// +// +stateify savable type FileDescriptionDefaultImpl struct{} // OnClose implements FileDescriptionImpl.OnClose analogously to @@ -57,7 +59,11 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err } // Allocate implements FileDescriptionImpl.Allocate analogously to -// fallocate called on regular file, directory or FIFO in Linux. +// fallocate called on an invalid type of file in Linux. +// +// Note that directories can rely on this implementation even though they +// should technically return EISDIR. Allocate should never be called for a +// directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return syserror.ENODEV } @@ -134,34 +140,36 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg return 0, syserror.ENOTTY } -// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. -func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) { - // This isn't exactly accurate; see FileDescription.Listxattr. +func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { + // This isn't exactly accurate; see FileDescription.ListXattr. return nil, syserror.ENOTSUP } -// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) { +func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", syserror.ENOTSUP } -// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { +func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { return syserror.ENOTSUP } -// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { +func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { return syserror.ENOTSUP } // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. +// +// +stateify savable type DirectoryFileDescriptionDefaultImpl struct{} // Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. @@ -192,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme // DentryMetadataFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is // true to obtain implementations of Stat and SetStat that panic. +// +// +stateify savable type DentryMetadataFileDescriptionImpl struct{} // Stat implements FileDescriptionImpl.Stat. 
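The embeddable defaults above are meant to be composed by struct embedding. The following is a minimal sketch, not taken from this change: the package and type names are hypothetical, and a real implementation would also provide Stat, SetStat, Seek, and the other methods the defaults leave out.

    package memfile // hypothetical example package

    import (
        "io"

        "gvisor.dev/gvisor/pkg/context"
        "gvisor.dev/gvisor/pkg/sentry/vfs"
        "gvisor.dev/gvisor/pkg/syserror"
        "gvisor.dev/gvisor/pkg/usermem"
    )

    // fileDescription backs a file with an in-memory byte slice. The embedded
    // FileDescriptionDefaultImpl answers Allocate with ENODEV, Ioctl with
    // ENOTTY, and the *Xattr methods with ENOTSUP, so only the operations this
    // file type actually supports are spelled out.
    type fileDescription struct {
        vfs.FileDescription
        vfs.FileDescriptionDefaultImpl
        vfs.NoLockFD // Lock*/Unlock* return ENOLCK.

        data []byte
    }

    // Release implements vfs.FileDescriptionImpl.Release.
    func (fd *fileDescription) Release(ctx context.Context) {}

    // PRead implements vfs.FileDescriptionImpl.PRead, overriding the default.
    func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
        if offset < 0 {
            return 0, syserror.EINVAL
        }
        if offset >= int64(len(fd.data)) {
            return 0, io.EOF
        }
        n, err := dst.CopyOut(ctx, fd.data[offset:])
        return int64(n), err
    }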
@@ -206,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt // DynamicBytesSource represents a data source for a // DynamicBytesFileDescriptionImpl. +// +// +stateify savable type DynamicBytesSource interface { // Generate writes the file's contents to buf. Generate(ctx context.Context, buf *bytes.Buffer) error } // StaticData implements DynamicBytesSource over a static string. +// +// +stateify savable type StaticData struct { Data string } @@ -238,14 +252,24 @@ type WritableDynamicBytesSource interface { // // DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first // use. +// +// +stateify savable type DynamicBytesFileDescriptionImpl struct { data DynamicBytesSource // immutable - mu sync.Mutex // protects the following fields - buf bytes.Buffer + mu sync.Mutex `state:"nosave"` // protects the following fields + buf bytes.Buffer `state:".([]byte)"` off int64 lastRead int64 // offset at which the last Read, PRead, or Seek ended } +func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte { + return fd.buf.Bytes() +} + +func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) { + fd.buf.Write(p) +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data @@ -378,6 +402,8 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M // LockFD may be used by most implementations of FileDescriptionImpl.Lock* // functions. Caller must call Init(). +// +// +stateify savable type LockFD struct { locks *FileLocks } @@ -405,6 +431,8 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. +// +// +stateify savable type NoLockFD struct{} // LockBSD implements vfs.FileDescriptionImpl.LockBSD. diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index df3758fd1..c93d94634 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -15,8 +15,6 @@ package vfs import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" @@ -34,9 +32,7 @@ import ( // // +stateify savable type Filesystem struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. @@ -52,7 +48,7 @@ type Filesystem struct { // Init must be called before first use of fs. func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { - fs.refs = 1 + fs.EnableLeakCheck() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl @@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -// IncRef increments fs' reference count. -func (fs *Filesystem) IncRef() { - if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.IncRef() called without holding a reference") - } -} - -// TryIncRef increments fs' reference count and returns true. If fs' reference -// count is zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fs. -func (fs *Filesystem) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fs.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fs' reference count. 
func (fs *Filesystem) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() fs.impl.Release(ctx) - } else if refs < 0 { - panic("Filesystem.decRef() called without holding a reference") - } + }) } // FilesystemImpl contains implementation details for a Filesystem. @@ -212,8 +183,9 @@ type FilesystemImpl interface { // ENOENT. Equivalently, if vd represents a file with a link count of 0 not // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If LinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -231,8 +203,9 @@ type FilesystemImpl interface { // - If the directory in which the new directory would be created has been // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MkdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -253,8 +226,9 @@ type FilesystemImpl interface { // - If the directory in which the file would be created has been removed // by RmdirAt or RenameAt, MknodAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MknodAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -345,11 +319,12 @@ type FilesystemImpl interface { // - If renaming would replace a non-empty directory, RenameAt returns // ENOTEMPTY. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a - // previous call to - // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is - // not "." or "..". + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). + // * oldParentVD.Dentry() was obtained from a previous call to + // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). + // * oldName is not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -372,8 +347,9 @@ type FilesystemImpl interface { // - If the file at rp exists but is not a directory, RmdirAt returns // ENOTDIR. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If RmdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -410,8 +386,9 @@ type FilesystemImpl interface { // - If the directory in which the symbolic link would be created has been // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). 
+ // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If SymlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -431,33 +408,34 @@ type FilesystemImpl interface { // // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If UnlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). UnlinkAt(ctx context.Context, rp *ResolvingPath) error - // ListxattrAt returns all extended attribute names for the file at rp. + // ListXattrAt returns all extended attribute names for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, - // ListxattrAt returns ENOTSUP. + // ListXattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned (note that // implementations are free to ignore size entirely and return without // error). In all cases, if size is 0, the list should be returned without // error, regardless of size. - ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) + ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) - // GetxattrAt returns the value associated with the given extended + // GetXattrAt returns the value associated with the given extended // attribute for the file at rp. // // Errors: // - // - If extended attributes are not supported by the filesystem, GetxattrAt + // - If extended attributes are not supported by the filesystem, GetXattrAt // returns ENOTSUP. // // - If an extended attribute named opts.Name does not exist, ENODATA is // returned. // // - If the size of the return value exceeds opts.Size, ERANGE may be // returned (note that implementations are free to ignore opts.Size entirely // and return without error). In all cases, if opts.Size is 0, the value // should be returned without error, regardless of size. - GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) + GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) - // SetxattrAt changes the value associated with the given extended + // SetXattrAt changes the value associated with the given extended // attribute for the file at rp. // // Errors: // - // - If extended attributes are not supported by the filesystem, SetxattrAt + // - If extended attributes are not supported by the filesystem, SetXattrAt // returns ENOTSUP. // // - If XATTR_CREATE is set in opts.Flags and opts.Name already exists, // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist, // ENODATA is returned. - SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error + SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error - // RemovexattrAt removes the given extended attribute from the file at rp. + // RemoveXattrAt removes the given extended attribute from the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, - // RemovexattrAt returns ENOTSUP. + // RemoveXattrAt returns ENOTSUP. // // - If name does not exist, ENODATA is returned.
- RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // @@ -528,6 +506,8 @@ type FilesystemImpl interface { // PrependPathAtVFSRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter the contextual VFS root. +// +// +stateify savable type PrependPathAtVFSRootError struct{} // Error implements error.Error. @@ -538,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string { // PrependPathAtNonMountRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter an independent ancestor // Dentry that is not the Mount root. +// +// +stateify savable type PrependPathAtNonMountRootError struct{} // Error implements error.Error. @@ -548,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string { // PrependPathSyntheticError is returned by implementations of // FilesystemImpl.PrependPath() for which prepended names do not represent real // paths. +// +// +stateify savable type PrependPathSyntheticError struct{} // Error implements error.Error. diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 465e610e0..2620cf975 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,6 +16,9 @@ package vfs import ( "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/usermem" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string { } return m } + +// GenericStatFS returns a statfs struct filled with the common fields for a +// general filesystem. This is analogous to Linux's fs/libfs.c:simple_statfs(). +func GenericStatFS(fsMagic uint64) linux.Statfs { + return linux.Statfs{ + Type: fsMagic, + BlockSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + } +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index f2298f7f6..9d54cc4ed 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -33,6 +33,9 @@ type FilesystemType interface { // Name returns the name of this FilesystemType. Name() string + + // Release releases all resources held by this FilesystemType. + Release(ctx context.Context) } // GetFilesystemOptions contains options to FilesystemType.GetFilesystem. @@ -55,10 +58,13 @@ type registeredFilesystemType struct { // RegisterFilesystemTypeOptions contains options to // VirtualFilesystem.RegisterFilesystem(). +// +// +stateify savable type RegisterFilesystemTypeOptions struct { - // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt() - // for which MountOptions.InternalMount == false to use this filesystem - // type. + // AllowUserMount determines whether users are allowed to mount a filesystem + // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls + // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false + // to use this filesystem type.
AllowUserMount bool // If AllowUserList is true, make this filesystem type visible in diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md index e7da49faa..833db213f 100644 --- a/pkg/sentry/vfs/g3doc/inotify.md +++ b/pkg/sentry/vfs/g3doc/inotify.md @@ -28,9 +28,9 @@ The set of all watches held on a single file (i.e., the watch target) is stored in vfs.Watches. Each watch will belong to a different inotify instance (an instance can only have one watch on any watch target). The watches are stored in a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions -to a single file will all share the same vfs.Watches. Activity on the target -causes its vfs.Watches to generate notifications on its watches’ inotify -instances. +to a single file will all share the same vfs.Watches (with the exception of the +gofer filesystem, described in a later section). Activity on the target causes +its vfs.Watches to generate notifications on its watches’ inotify instances. ### vfs.Watch @@ -103,12 +103,12 @@ inotify: unopened p9 file (and possibly an open FID), through which the Sentry interacts with the gofer. * *Solution:* Because there is no inode structure stored in the sandbox, - inotify watches must be held on the dentry. This would be an issue in - the presence of hard links, where multiple dentries would need to share - the same set of watches, but in VFS2, we do not support the internal - creation of hard links on gofer fs. As a result, we make the assumption - that every dentry corresponds to a unique inode. However, the next point - raises an issue with this assumption: + inotify watches must be held on the dentry. For the purposes of inotify, + we assume that every dentry corresponds to a unique inode, which may + cause unexpected behavior in the presence of hard links, where multiple + dentries should share the same set of watches. Indeed, it is impossible + for us to be absolutely sure whether dentries correspond to the same + file or not, due to the following point: * **The Sentry cannot always be aware of hard links on the remote filesystem.** There is no way for us to confirm whether two files on the remote filesystem are actually links to the same inode. QIDs and inodes are diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go index 8882fa84a..2d27d9d35 100644 --- a/pkg/sentry/vfs/genericfstree/genericfstree.go +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -27,6 +27,8 @@ import ( ) // Dentry is a required type parameter that is a struct with the given fields. +// +// +stateify savable type Dentry struct { // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl. vfsd vfs.Dentry diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index aff220a61..3f0b8f45b 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16 // // The way events are labelled appears somewhat arbitrary, but they must match // Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +// +// +stateify savable type EventType uint8 // PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go index 42666eebf..55783d4eb 100644 --- a/pkg/sentry/vfs/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -33,6 +33,8 @@ import ( // Note that in Linux these two types of locks are _not_ cooperative, because // race and deadlock conditions make merging them prohibitive. 
We do the same // and keep them oblivious to each other. +// +// +stateify savable type FileLocks struct { // bsd is a set of BSD-style advisory file wide locks, see flock(2). bsd fslock.Locks diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go index cc1e7d764..638b5d830 100644 --- a/pkg/sentry/vfs/memxattr/xattr.go +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct { xattrs map[string]string } -// Getxattr returns the value at 'name'. -func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) { +// GetXattr returns the value at 'name'. +func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) { x.mu.RLock() value, ok := x.xattrs[opts.Name] x.mu.RUnlock() @@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, return value, nil } -// Setxattr sets 'value' at 'name'. -func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { +// SetXattr sets 'value' at 'name'. +func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error { x.mu.Lock() defer x.mu.Unlock() if x.xattrs == nil { @@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { return nil } -// Listxattr returns all names in xattrs. -func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { +// ListXattr returns all names in xattrs. +func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) { // Keep track of the size of the buffer needed in listxattr(2) for the list. listSize := 0 x.mu.RLock() @@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { return names, nil } -// Removexattr removes the xattr at 'name'. -func (x *SimpleExtendedAttributes) Removexattr(name string) error { +// RemoveXattr removes the xattr at 'name'. +func (x *SimpleExtendedAttributes) RemoveXattr(name string) error { x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 67dfba986..78f115bfa 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -46,8 +46,9 @@ import ( // +stateify savable type Mount struct { // vfs, fs, root are immutable. References are held on fs and root. + // Note that for a disconnected mount, root may be nil. // - // Invariant: root belongs to fs. + // Invariant: if not nil, root belongs to fs. vfs *VirtualFilesystem fs *Filesystem root *Dentry @@ -65,7 +66,7 @@ type Mount struct { // // Invariant: key.parent != nil iff key.point != nil. key.point belongs to // key.parent.fs. - key mountKey + key mountKey `state:".(VirtualDentry)"` // ns is the namespace in which this Mount was mounted. ns is protected by // VirtualFilesystem.mountMu. @@ -126,16 +127,14 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + MountNamespaceRefs + // Owner is the usernamespace that owns this mount namespace. Owner *auth.UserNamespace // root is the MountNamespace's root mount. root is immutable. root *Mount - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - // mountpoints maps all Dentries which are mount points in this namespace // to the number of Mounts for which they are mount points. mountpoints is // protected by VirtualFilesystem.mountMu. 
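The renamed memxattr methods above are designed to be embedded into filesystems that keep extended attributes in memory. A minimal sketch of the intended use, with a hypothetical inode type (tmpfs does essentially this):

    package mem // hypothetical example package

    import (
        "gvisor.dev/gvisor/pkg/sentry/vfs"
        "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
    )

    // inode stores its extended attributes entirely in memory.
    type inode struct {
        xattrs memxattr.SimpleExtendedAttributes
    }

    // The inode forwards each renamed *Xattr operation; permission checks are
    // assumed to happen in the caller before these are reached.
    func (i *inode) getXattr(opts *vfs.GetXattrOptions) (string, error) {
        return i.xattrs.GetXattr(opts)
    }

    func (i *inode) setXattr(opts *vfs.SetXattrOptions) error {
        return i.xattrs.SetXattr(opts)
    }

    func (i *inode) listXattr(size uint64) ([]string, error) {
        return i.xattrs.ListXattr(size)
    }

    func (i *inode) removeXattr(name string) error {
        return i.xattrs.RemoveXattr(name)
    }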
@@ -154,22 +153,22 @@ type MountNamespace struct { // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, syserror.ENODEV } - fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return nil, err } mntns := &MountNamespace{ Owner: creds.UserNamespace, - refs: 1, mountpoints: make(map[*Dentry]uint32), } - mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) + mntns.EnableLeakCheck() + mntns.root = newMount(vfs, fs, root, mntns, opts) return mntns, nil } @@ -263,16 +262,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr } // MountAt creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { +// The VirtualFilesystem will hold a reference to the Mount until it is unmounted. +// +// This method returns the mounted Mount without a reference, for convenience +// during VFS setup when there is no chance of racing with unmount. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) if err != nil { - return err + return nil, err } defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { - return err + return nil, err } - return nil + return mnt, nil } // UmountAt removes the Mount at the given path. @@ -343,6 +346,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti return nil } +// +stateify savable type umountRecursiveOptions struct { // If eager is true, ensure that future calls to Mount.tryIncMountedRef() // on umounted mounts fail. @@ -369,8 +373,9 @@ type umountRecursiveOptions struct { // // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { if !mnt.umounted { mnt.umounted = true @@ -399,9 +404,11 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu // connectLocked makes vd the mount parent/point for mnt. It consumes // references held by vd. // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt -// must not already be connected. 
+// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. +// * d.mu must be locked. +// * mnt.parent() == nil, i.e. mnt must not already be connected. func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { if checkInvariants { if mnt.parent() != nil { @@ -409,7 +416,7 @@ } } mnt.IncRef() // dropped by callers of umountRecursiveLocked - mnt.storeKey(vd) + mnt.setKey(vd) if vd.mount.children == nil { vd.mount.children = make(map[*Mount]struct{}) } @@ -429,16 +436,18 @@ // disconnectLocked makes vd have no mount parent/point and returns its old // mount parent/point with a reference held. // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. mnt.parent() != nil. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. +// * mnt.parent() != nil. func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { - vd := mnt.loadKey() + vd := mnt.getKey() if checkInvariants { if vd.mount == nil { panic("VFS.disconnectLocked called on disconnected mount") } } - mnt.storeKey(VirtualDentry{}) + mnt.loadKey(VirtualDentry{}) delete(vd.mount.children, mnt) atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 mnt.ns.mountpoints[vd.dentry]-- @@ -490,7 +499,9 @@ func (mnt *Mount) DecRef(ctx context.Context) { mnt.vfs.mounts.seq.EndWrite() mnt.vfs.mountMu.Unlock() } - mnt.root.DecRef(ctx) + if mnt.root != nil { + mnt.root.DecRef(ctx) + } mnt.fs.DecRef(ctx) if vd.Ok() { vd.DecRef(ctx) @@ -498,17 +509,10 @@ } } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - // DecRef decrements mntns' reference count. func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() - if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + mntns.MountNamespaceRefs.DecRef(func() { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ @@ -522,9 +526,7 @@ for _, mnt := range mountsToDecRef { mnt.DecRef(ctx) } - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } + }) } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes @@ -576,8 +578,9 @@ retryFirst: // mnt. It takes a reference on the returned VirtualDentry. If no such mount // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). // -// Preconditions: References are held on mnt and root. vfsroot is not (mnt, -// mnt.root). +// Preconditions: +// * References are held on mnt and root. +// * vfsroot is not (mnt, mnt.root). func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // @@ -651,6 +654,13 @@ retryFirst: return VirtualDentry{mnt, d} } +// SetMountReadOnly sets the mount as ReadOnly.
+func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + return mnt.setReadOnlyLocked(ro) +} + // CheckBeginWrite increments the counter of in-progress write operations on // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns // EROFS. @@ -717,14 +727,12 @@ func (mnt *Mount) Root() *Dentry { return mnt.root } -// Root returns mntns' root. A reference is taken on the returned -// VirtualDentry. +// Root returns mntns' root. It does not take a reference on the returned +// VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { vd := VirtualDentry{ mount: mntns.root, dentry: mntns.root.root, } - vd.IncRef() return vd } @@ -732,11 +740,23 @@ func (mntns *MountNamespace) Root() VirtualDentry { // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -747,7 +767,7 @@ if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -781,11 +801,25 @@ // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or + // vfs.StatAt() (=> FilesystemImpl.StatAt()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + + creds := auth.CredentialsFromContext(ctx) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -796,7 +830,7 @@ if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path.
- ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -809,9 +843,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo Root: mntRootVD, Start: mntRootVD, } - statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) if err != nil { // Well that's not good. Ignore this mount. + ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) break } @@ -823,6 +858,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo fmt.Fprintf(buf, "%d ", mnt.ID) // (2) Parent ID (or this ID if there is no parent). + // Note that even if the call to mnt.parent() races with Mount + // destruction (which is possible since we're not holding vfs.mountMu), + // its Mount.ID will still be valid. pID := mnt.ID if p := mnt.parent(); p != nil { pID = p.ID diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index 3335e4057..cb8c56bd3 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) { mt.Init() mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mount.setKey(VirtualDentry{&Mount{}, &Dentry{}}) mt.Insert(mount) if m := mt.Lookup(mount.parent(), mount.point()); m != mount { @@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false func newBenchMount() *Mount { mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}}) return mount } @@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) { for i := 0; i < numMounts; i++ { mount := newBenchMount() mt.Insert(mount) - keys = append(keys, mount.loadKey()) + keys = append(keys, mount.saveKey()) } var ready sync.WaitGroup @@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := mount.loadKey() + key := mount.saveKey() ms[key] = mount keys = append(keys, key) } @@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := mount.loadKey() + key := mount.getKey() ms.Store(key, mount) keys = append(keys, key) } @@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms[mount.loadKey()] = mount + ms[mount.getKey()] = mount } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { var ms sync.Map for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms[mount.loadKey()] = mount + ms[mount.saveKey()] = mount } } @@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), 
mount) } } @@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := range mounts { mount := mounts[i] - ms[mount.loadKey()] = mount + ms[mount.saveKey()] = mount } b.ResetTimer() for i := range mounts { mount := mounts[i] - delete(ms, mount.loadKey()) + delete(ms, mount.saveKey()) } } @@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) { var ms sync.Map for i := range mounts { mount := mounts[i] - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Delete(mount.loadKey()) + ms.Delete(mount.saveKey()) } } diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 70f850ca4..b7d122d22 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -34,6 +34,8 @@ import ( // structurally identical to VirtualDentry, but stores its fields as // unsafe.Pointer since mutators synchronize with VFS path traversal using // seqcounts. +// +// This is explicitly not savable. type mountKey struct { parent unsafe.Pointer // *Mount point unsafe.Pointer // *Dentry @@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry { return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) } -func (mnt *Mount) loadKey() VirtualDentry { +func (mnt *Mount) getKey() VirtualDentry { return VirtualDentry{ mount: mnt.parent(), dentry: mnt.point(), } } +func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } + // Invariant: mnt.key.parent == nil. vd.Ok(). -func (mnt *Mount) storeKey(vd VirtualDentry) { +func (mnt *Mount) setKey(vd VirtualDentry) { atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) } +func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) } + // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). @@ -92,6 +98,7 @@ type mountTable struct { // length and cap in separate uint32s) for ~free. size uint64 + // FIXME(gvisor.dev/issue/1663): Slots need to be saved. slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } @@ -217,8 +224,9 @@ func (mt *mountTable) Insert(mount *Mount) { // insertSeqed inserts the given mount into mt. // -// Preconditions: mt.seq must be in a writer critical section. mt must not -// already contain a Mount with the same mount point and parent. +// Preconditions: +// * mt.seq must be in a writer critical section. +// * mt must not already contain a Mount with the same mount point and parent. func (mt *mountTable) insertSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) @@ -269,9 +277,11 @@ func (mt *mountTable) insertSeqed(mount *Mount) { atomic.StorePointer(&mt.slots, newSlots) } -// Preconditions: There are no concurrent mutators of the table (slots, cap). -// If the table is visible to readers, then mt.seq must be in a writer critical -// section. cap must be a power of 2. +// Preconditions: +// * There are no concurrent mutators of the table (slots, cap). +// * If the table is visible to readers, then mt.seq must be in a writer +// critical section. +// * cap must be a power of 2. 
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { mask := cap - 1 off := (hash & mask) * mountSlotBytes @@ -313,8 +323,9 @@ func (mt *mountTable) Remove(mount *Mount) { // removeSeqed removes the given mount from mt. // -// Preconditions: mt.seq must be in a writer critical section. mt must contain -// mount. +// Preconditions: +// * mt.seq must be in a writer critical section. +// * mt must contain mount. func (mt *mountTable) removeSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) tcap := uintptr(1) << (mt.size & mtSizeOrderMask) diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index dfc8573fd..bc79e5ecc 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -21,6 +21,8 @@ import ( // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and // FilesystemImpl.GetDentryAt(). +// +// +stateify savable type GetDentryOptions struct { // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that // the returned Dentry is a directory for which creds has search @@ -30,6 +32,8 @@ // MkdirOptions contains options to VirtualFilesystem.MkdirAt() and // FilesystemImpl.MkdirAt(). +// +// +stateify savable type MkdirOptions struct { // Mode is the file mode bits for the created directory. Mode linux.FileMode @@ -56,6 +60,8 @@ type MkdirOptions struct { // MknodOptions contains options to VirtualFilesystem.MknodAt() and // FilesystemImpl.MknodAt(). +// +// +stateify savable type MknodOptions struct { // Mode is the file type and mode bits for the created file. Mode linux.FileMode @@ -72,6 +78,8 @@ type MknodOptions struct { // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC. // MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers. +// +// +stateify savable type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool @@ -93,6 +101,8 @@ type MountFlags struct { } // MountOptions contains options to VirtualFilesystem.MountAt(). +// +// +stateify savable type MountOptions struct { // Flags contains flags as specified for mount(2), e.g. MS_NOEXEC. Flags MountFlags @@ -103,13 +113,17 @@ type MountOptions struct { // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). GetFilesystemOptions GetFilesystemOptions - // If InternalMount is true, allow the use of filesystem types for which - // RegisterFilesystemTypeOptions.AllowUserMount == false. + // InternalMount indicates that the mount is internal to the sentry, rather + // than being requested by the application through mount(2). If InternalMount + // is true, allow the use of filesystem types for which + // RegisterFilesystemTypeOptions.AllowUserMount == false. InternalMount bool } // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). +// +// +stateify savable type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). // @@ -135,6 +149,8 @@ type OpenOptions struct { // ReadOptions contains options to FileDescription.PRead(), // FileDescriptionImpl.PRead(), FileDescription.Read(), and // FileDescriptionImpl.Read(). +// +// +stateify savable type ReadOptions struct { // Flags contains flags as specified for preadv2(2). Flags uint32 @@ -142,6 +158,8 @@ type ReadOptions struct { // RenameOptions contains options to VirtualFilesystem.RenameAt() and // FilesystemImpl.RenameAt().
+// +// +stateify savable type RenameOptions struct { // Flags contains flags as specified for renameat2(2). Flags uint32 @@ -153,6 +171,8 @@ type RenameOptions struct { // SetStatOptions contains options to VirtualFilesystem.SetStatAt(), // FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and // FileDescriptionImpl.SetStat(). +// +// +stateify savable type SetStatOptions struct { // Stat is the metadata that should be set. Only fields indicated by // Stat.Mask should be set. @@ -174,6 +194,8 @@ type SetStatOptions struct { // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() // and FilesystemImpl.BoundEndpointAt(). +// +// +stateify savable type BoundEndpointOptions struct { // Addr is the path of the file whose socket endpoint is being retrieved. // It is generally irrelevant: most endpoints are stored at a dentry that @@ -190,10 +212,12 @@ type BoundEndpointOptions struct { Addr string } -// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(), -// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and -// FileDescriptionImpl.Getxattr(). -type GetxattrOptions struct { +// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(), +// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and +// FileDescriptionImpl.GetXattr(). +// +// +stateify savable +type GetXattrOptions struct { // Name is the name of the extended attribute to retrieve. Name string @@ -204,10 +228,12 @@ type GetxattrOptions struct { Size uint64 } -// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), -// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and -// FileDescriptionImpl.Setxattr(). -type SetxattrOptions struct { +// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(), +// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and +// FileDescriptionImpl.SetXattr(). +// +// +stateify savable +type SetXattrOptions struct { // Name is the name of the extended attribute being mutated. Name string @@ -221,6 +247,8 @@ type SetxattrOptions struct { // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). +// +// +stateify savable type StatOptions struct { // Mask is the set of fields in the returned Statx that the FilesystemImpl // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. @@ -238,6 +266,8 @@ type StatOptions struct { } // UmountOptions contains options to VirtualFilesystem.UmountAt(). +// +// +stateify savable type UmountOptions struct { // Flags contains flags as specified for umount2(2). Flags uint32 @@ -246,6 +276,8 @@ type UmountOptions struct { // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). +// +// +stateify savable type WriteOptions struct { // Flags contains flags as specified for pwritev2(2). Flags uint32 diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 33389c1df..d48520d58 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -16,6 +16,7 @@ package vfs import ( "math" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -25,6 +26,8 @@ import ( ) // AccessTypes is a bitmask of Unix file permissions. +// +// +stateify savable type AccessTypes uint16 // Bits in AccessTypes. 
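Tying the mount-related options above back to the new MountAt signature shown earlier: a sketch of a setup-time caller (function name, source string, and flag choices are illustrative only), relying on the documented contract that the returned Mount is borrowed rather than referenced.

    package example // hypothetical example package

    import (
        "gvisor.dev/gvisor/pkg/context"
        "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
        "gvisor.dev/gvisor/pkg/sentry/vfs"
    )

    // mountTmpfs mounts a tmpfs instance at target during VFS setup. Because
    // InternalMount is true, the mount is permitted even if tmpfs were
    // registered with AllowUserMount == false.
    func mountTmpfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, target *vfs.PathOperation) (*vfs.Mount, error) {
        return vfsObj.MountAt(ctx, creds, "tmpfs" /* source */, target, "tmpfs", &vfs.MountOptions{
            Flags:         vfs.MountFlags{NoExec: true},
            InternalMount: true,
        })
    }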
@@ -271,7 +274,7 @@ func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth // operation must not proceed. Otherwise it returns the max length allowed // without violating the limit. func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { - fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur if fileSizeLimit > math.MaxInt64 { return size, nil } @@ -284,3 +287,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { } return size, nil } + +// CheckXattrPermissions checks permissions for extended attribute access. +// This is analogous to fs/xattr.c:xattr_permission(). Some key differences: +// * Does not check for read-only filesystem property. +// * Does not check inode immutability or append-only mode. In both cases EPERM +// must be returned by filesystem implementations. +// * Does not do inode permission checks. Filesystem implementations should +// handle inode permission checks as they may differ across implementations. +func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error { + switch { + case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX): + // The trusted.* namespace can only be accessed by privileged + // users. + if creds.HasCapability(linux.CAP_SYS_ADMIN) { + return nil + } + if ats.MayWrite() { + return syserror.EPERM + } + return syserror.ENODATA + case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): + // In the user.* namespace, only regular files and directories can have + // extended attributes. For sticky directories, only the owner and + // privileged users can write attributes. + filetype := mode.FileType() + if filetype != linux.ModeRegular && filetype != linux.ModeDirectory { + if ats.MayWrite() { + return syserror.EPERM + } + return syserror.ENODATA + } + if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) { + return syserror.EPERM + } + } + return nil +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 3304372d9..e4fd55012 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -35,6 +35,8 @@ import ( // FilesystemImpl methods. // // ResolvingPath is loosely analogous to Linux's struct nameidata. +// +// +stateify savable type ResolvingPath struct { vfs *VirtualFilesystem root VirtualDentry // refs borrowed from PathOperation @@ -88,6 +90,7 @@ func init() { // so error "constants" are really mutable vars, necessitating somewhat // expensive interface object comparisons. +// +stateify savable type resolveMountRootOrJumpError struct{} // Error implements error.Error. @@ -95,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string { return "resolving mount root or jump" } +// +stateify savable type resolveMountPointError struct{} // Error implements error.Error. @@ -102,6 +106,7 @@ func (resolveMountPointError) Error() string { return "resolving mount point" } +// +stateify savable type resolveAbsSymlinkError struct{} // Error implements error.Error.
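CheckXattrPermissions deliberately leaves inode permission checks to the filesystem, per the comment above. A sketch of how an implementation might layer the two checks, assuming the package's GenericCheckPermissions helper (the inode type and its atomically accessed fields are hypothetical):

    package mem // hypothetical example package

    import (
        "sync/atomic"

        "gvisor.dev/gvisor/pkg/abi/linux"
        "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
        "gvisor.dev/gvisor/pkg/sentry/vfs"
    )

    type inode struct {
        mode uint32 // file type and permission bits, accessed atomically
        uid  uint32 // auth.KUID, accessed atomically
        gid  uint32 // auth.KGID, accessed atomically
    }

    // checkXattrPermissions runs the filesystem's own inode permission check
    // first, then the shared namespace rules from CheckXattrPermissions.
    func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
        mode := linux.FileMode(atomic.LoadUint32(&i.mode))
        kuid := auth.KUID(atomic.LoadUint32(&i.uid))
        kgid := auth.KGID(atomic.LoadUint32(&i.gid))
        if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
            return err
        }
        return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
    }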
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9c2420683..38d2701d2 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -24,9 +24,9 @@ // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // EpollInstance.mu -// Inotify.mu -// Watches.mu -// Inotify.evMu +// Inotify.mu +// Watches.mu +// Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding @@ -36,6 +36,7 @@ package vfs import ( "fmt" + "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -157,11 +158,23 @@ func (vfs *VirtualFilesystem) Init(ctx context.Context) error { return nil } +// Release drops references on filesystem objects held by vfs. +// +// Precondition: This must be called after VFS.Init() has succeeded. +func (vfs *VirtualFilesystem) Release(ctx context.Context) { + vfs.anonMount.DecRef(ctx) + for _, fst := range vfs.fsTypes { + fst.fsType.Release(ctx) + } +} + // PathOperation specifies the path operated on by a VFS method. // // PathOperation is passed to VFS methods by pointer to reduce memory copying: // it's somewhat large and should never escape. (Options structs are passed by // pointer to VFS and FileDescription methods for the same reason.) +// +// +stateify savable type PathOperation struct { // Root is the VFS root. References on Root are borrowed from the provider // of the PathOperation. @@ -296,6 +309,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential // MkdirAt creates a directory at the given path. func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { return syserror.EEXIST } @@ -332,6 +347,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia // error from the syserror package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { return syserror.EEXIST } @@ -517,6 +534,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti // RmdirAt removes the directory at the given path. func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). if pop.Path.Absolute { return syserror.EBUSY } @@ -598,6 +617,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti // SymlinkAt creates a symbolic link at the given path with the given target. func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with symlinkat(oldpath, newdirfd, ""). 
if pop.Path.Absolute { return syserror.EEXIST } @@ -630,6 +651,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent // UnlinkAt deletes the non-directory file at the given path. func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { return syserror.EBUSY } @@ -661,12 +684,6 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // BoundEndpointAt gets the bound endpoint at the given path, if one exists. func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { - if !pop.Path.Begin.Ok() { - if pop.Path.Absolute { - return nil, syserror.ECONNREFUSED - } - return nil, syserror.ENOENT - } rp := vfs.getResolvingPath(creds, pop) for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) @@ -686,12 +703,12 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C } } -// ListxattrAt returns all extended attribute names for the file at the given +// ListXattrAt returns all extended attribute names for the file at the given // path. -func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { +func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { rp := vfs.getResolvingPath(creds, pop) for { - names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) + names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) if err == nil { vfs.putResolvingPath(ctx, rp) return names, nil @@ -711,12 +728,12 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede } } -// GetxattrAt returns the value associated with the given extended attribute +// GetXattrAt returns the value associated with the given extended attribute // for the file at the given path. -func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) { +func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { - val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(ctx, rp) return val, nil @@ -728,12 +745,12 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden } } -// SetxattrAt changes the value associated with the given extended attribute +// SetXattrAt changes the value associated with the given extended attribute // for the file at the given path. 
-func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return nil
@@ -745,11 +762,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
 	}
 }
 
-// RemovexattrAt removes the given extended attribute from the file at rp.
-func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+// RemoveXattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return nil
@@ -782,6 +799,62 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 	return retErr
 }
 
+// MkdirAllAt recursively creates non-existent directories on the given path
+// (including the last component).
+func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error {
+	pop := &PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+	stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
+	switch err {
+	case nil:
+		if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory {
+			return syserror.ENOTDIR
+		}
+		// Directory already exists.
+		return nil
+	case syserror.ENOENT:
+		// Expected, we will create the dir.
+	default:
+		return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
+	}
+
+	// Recurse to ensure parent is created and then create the final directory.
+	if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil {
+		return err
+	}
+	if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
+	}
+	return nil
+}
+
+// MakeSyntheticMountpoint creates parent directories of target if they do not
+// exist and attempts to create a directory for the mountpoint. If a
+// non-directory file already exists there then we allow it.
+func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
+	mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+
+	// Make sure the parent directory of target exists.
+	if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
+	}
+
+	// Attempt to mkdir the final component. If a file (of any type) exists
+	// then we allow mounting on top of that because we do not require the
+	// target to be an existing directory, unlike Linux mount(2).
+	if err := vfs.MkdirAt(ctx, creds, &PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(target),
+	}, mkdirOpts); err != nil && err != syserror.EEXIST {
+		return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
+	}
+	return nil
+}
+
 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
 // (which represents a node in a Filesystem's tree) and a Mount (which
 // represents the Filesystem's position in a VFS mount tree).
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 748273366..bbafb8b7f 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -96,15 +96,33 @@ const (
 	Panic
 )
 
+// Set implements flag.Value.
+func (a *Action) Set(v string) error {
+	switch v {
+	case "log", "logwarning":
+		*a = LogWarning
+	case "panic":
+		*a = Panic
+	default:
+		return fmt.Errorf("invalid watchdog action %q", v)
+	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (a *Action) Get() interface{} {
+	return *a
+}
+
 // String returns Action's string representation.
-func (a Action) String() string {
-	switch a {
+func (a *Action) String() string {
+	switch *a {
 	case LogWarning:
-		return "LogWarning"
+		return "logwarning"
 	case Panic:
-		return "Panic"
+		return "panic"
 	default:
-		panic(fmt.Sprintf("Invalid action: %d", a))
+		panic(fmt.Sprintf("Invalid watchdog action: %d", *a))
 	}
 }
diff --git a/pkg/shim/v2/runtimeoptions/BUILD b/pkg/shim/v2/runtimeoptions/BUILD
index ba2ed1ea7..abb8c3be3 100644
--- a/pkg/shim/v2/runtimeoptions/BUILD
+++ b/pkg/shim/v2/runtimeoptions/BUILD
@@ -11,12 +11,12 @@ proto_library(
 
 go_library(
     name = "runtimeoptions",
-    srcs = ["runtimeoptions.go"],
-    visibility = ["//pkg/shim/v2:__pkg__"],
-    deps = [
-        ":api_go_proto",
-        "@com_github_gogo_protobuf//proto:go_default_library",
+    srcs = [
+        "runtimeoptions.go",
+        "runtimeoptions_cri.go",
     ],
+    visibility = ["//pkg/shim/v2:__pkg__"],
+    deps = ["@com_github_gogo_protobuf//proto:go_default_library"],
 )
 
 go_test(
@@ -27,6 +27,6 @@ go_test(
     deps = [
         "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
         "@com_github_containerd_typeurl//:go_default_library",
-        "@com_github_golang_protobuf//proto:go_default_library",
+        "@com_github_gogo_protobuf//proto:go_default_library",
     ],
 )
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.go b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
index aaf17b87a..072dd87f0 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions.go
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
@@ -13,18 +13,5 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package runtimeoptions contains the runtimeoptions proto.
 package runtimeoptions
-
-import (
-	proto "github.com/gogo/protobuf/proto"
-	pb "gvisor.dev/gvisor/pkg/shim/v2/runtimeoptions/api_go_proto"
-)
-
-type Options = pb.Options
-
-func init() {
-	// The generated proto file auto registers with "golang/protobuf/proto"
-	// package. However, typeurl uses "golang/gogo/protobuf/proto". So registers
-	// the type there too.
-	proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
-}
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go
new file mode 100644
index 000000000..e6102b4cf
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go
@@ -0,0 +1,383 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package runtimeoptions + +import ( + "fmt" + "io" + "reflect" + "strings" + + proto "github.com/gogo/protobuf/proto" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package + +type Options struct { + // TypeUrl specifies the type of the content inside the config file. + TypeUrl string `protobuf:"bytes,1,opt,name=type_url,json=typeUrl,proto3" json:"type_url,omitempty"` + // ConfigPath specifies the filesystem location of the config file + // used by the runtime. + ConfigPath string `protobuf:"bytes,2,opt,name=config_path,json=configPath,proto3" json:"config_path,omitempty"` +} + +func (m *Options) Reset() { *m = Options{} } +func (*Options) ProtoMessage() {} +func (*Options) Descriptor() ([]byte, []int) { return fileDescriptorApi, []int{0} } + +func (m *Options) GetTypeUrl() string { + if m != nil { + return m.TypeUrl + } + return "" +} + +func (m *Options) GetConfigPath() string { + if m != nil { + return m.ConfigPath + } + return "" +} + +func init() { + proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options") +} + +func (m *Options) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *Options) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.TypeUrl) > 0 { + dAtA[i] = 0xa + i++ + i = encodeVarintApi(dAtA, i, uint64(len(m.TypeUrl))) + i += copy(dAtA[i:], m.TypeUrl) + } + if len(m.ConfigPath) > 0 { + dAtA[i] = 0x12 + i++ + i = encodeVarintApi(dAtA, i, uint64(len(m.ConfigPath))) + i += copy(dAtA[i:], m.ConfigPath) + } + return i, nil +} + +func encodeVarintApi(dAtA []byte, offset int, v uint64) int { + for v >= 1<<7 { + dAtA[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + dAtA[offset] = uint8(v) + return offset + 1 +} + +func (m *Options) Size() (n int) { + var l int + _ = l + l = len(m.TypeUrl) + if l > 0 { + n += 1 + l + sovApi(uint64(l)) + } + l = len(m.ConfigPath) + if l > 0 { + n += 1 + l + sovApi(uint64(l)) + } + return n +} + +func sovApi(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} + +func sozApi(x uint64) (n int) { + return sovApi(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} + +func (this *Options) String() string { + if this == nil { + return "nil" + } + s := strings.Join([]string{`&Options{`, + `TypeUrl:` + fmt.Sprintf("%v", this.TypeUrl) + `,`, + `ConfigPath:` + fmt.Sprintf("%v", this.ConfigPath) + `,`, + `}`, + }, "") + return s +} + +func valueToStringApi(v interface{}) string { + rv := reflect.ValueOf(v) + if rv.IsNil() { + return "nil" + } + pv := reflect.Indirect(rv).Interface() + return 
fmt.Sprintf("*%v", pv) +} + +func (m *Options) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowApi + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: Options: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: Options: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field TypeUrl", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowApi + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthApi + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.TypeUrl = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ConfigPath", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowApi + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthApi + } + postIndex := iNdEx + intStringLen + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.ConfigPath = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipApi(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthApi + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} + +func skipApi(dAtA []byte) (n int, err error) { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowApi + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowApi + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if dAtA[iNdEx-1] < 0x80 { + break + } + } + return iNdEx, nil + case 1: + iNdEx += 8 + return iNdEx, nil + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowApi + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + iNdEx += length + if length < 0 { + return 0, ErrInvalidLengthApi + } + return iNdEx, nil + case 3: + for { + var innerWire uint64 + var start int = iNdEx + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowApi + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + innerWire 
|= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + innerWireType := int(innerWire & 0x7) + if innerWireType == 4 { + break + } + next, err := skipApi(dAtA[start:]) + if err != nil { + return 0, err + } + iNdEx = start + next + } + return iNdEx, nil + case 4: + return iNdEx, nil + case 5: + iNdEx += 4 + return iNdEx, nil + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + } + panic("unreachable") +} + +var ( + ErrInvalidLengthApi = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowApi = fmt.Errorf("proto: integer overflow") +) + +func init() { proto.RegisterFile("api.proto", fileDescriptorApi) } + +var fileDescriptorApi = []byte{ + // 183 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4c, 0x2c, 0xc8, 0xd4, + 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x12, 0x4d, 0x2e, 0xca, 0xd4, 0x2b, 0x2a, 0xcd, 0x2b, 0xc9, + 0xcc, 0x4d, 0xcd, 0x2f, 0x28, 0xc9, 0xcc, 0xcf, 0x2b, 0xd6, 0x2b, 0x33, 0x94, 0xd2, 0x4d, 0xcf, + 0x2c, 0xc9, 0x28, 0x4d, 0xd2, 0x4b, 0xce, 0xcf, 0xd5, 0x4f, 0xcf, 0x4f, 0xcf, 0xd7, 0x07, 0xab, + 0x4e, 0x2a, 0x4d, 0x03, 0xf3, 0xc0, 0x1c, 0x30, 0x0b, 0x62, 0x8a, 0x92, 0x2b, 0x17, 0xbb, 0x3f, + 0x44, 0xb3, 0x90, 0x24, 0x17, 0x47, 0x49, 0x65, 0x41, 0x6a, 0x7c, 0x69, 0x51, 0x8e, 0x04, 0xa3, + 0x02, 0xa3, 0x06, 0x67, 0x10, 0x3b, 0x88, 0x1f, 0x5a, 0x94, 0x23, 0x24, 0xcf, 0xc5, 0x9d, 0x9c, + 0x9f, 0x97, 0x96, 0x99, 0x1e, 0x5f, 0x90, 0x58, 0x92, 0x21, 0xc1, 0x04, 0x96, 0xe5, 0x82, 0x08, + 0x05, 0x24, 0x96, 0x64, 0x38, 0xc9, 0x9c, 0x78, 0x28, 0xc7, 0x78, 0xe3, 0xa1, 0x1c, 0x43, 0xc3, + 0x23, 0x39, 0xc6, 0x13, 0x8f, 0xe4, 0x18, 0x2f, 0x3c, 0x92, 0x63, 0x7c, 0xf0, 0x48, 0x8e, 0x71, + 0xc2, 0x63, 0x39, 0x86, 0x24, 0x36, 0xb0, 0x5d, 0xc6, 0x80, 0x00, 0x00, 0x00, 0xff, 0xff, 0x07, + 0x00, 0xf2, 0x18, 0xbe, 0x00, 0x00, 0x00, +} diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go index f4c238a00..c59a2400e 100644 --- a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go +++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go @@ -15,11 +15,12 @@ package runtimeoptions import ( + "bytes" "testing" shim "github.com/containerd/containerd/runtime/v1/shim/v1" "github.com/containerd/typeurl" - "github.com/golang/protobuf/proto" + "github.com/gogo/protobuf/proto" ) func TestCreateTaskRequest(t *testing.T) { @@ -32,7 +33,11 @@ func TestCreateTaskRequest(t *testing.T) { if err := proto.UnmarshalText(encodedText, got); err != nil { t.Fatalf("unable to unmarshal text: %v", err) } - t.Logf("got: %s", proto.MarshalTextString(got)) + var textBuffer bytes.Buffer + if err := proto.MarshalText(&textBuffer, got); err != nil { + t.Errorf("unable to marshal text: %v", err) + } + t.Logf("got: %s", string(textBuffer.Bytes())) // Check the options. wantOptions := &Options{} diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index 118805492..19bce2afb 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.11 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/state/decode.go b/pkg/state/decode.go index c9971cdf6..89467ca8e 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -584,10 +584,12 @@ func (ds *decodeState) Load(obj reflect.Value) { }) // Create the root object. 
- ds.objectsByID = append(ds.objectsByID, &objectDecodeState{ + rootOds := &objectDecodeState{ id: 1, obj: obj, - }) + } + ds.objectsByID = append(ds.objectsByID, rootOds) + ds.pending.PushBack(rootOds) // Read the number of objects. lastID, object, err := ReadHeader(ds.r) diff --git a/pkg/state/pretty/pretty.go b/pkg/state/pretty/pretty.go index cf37aaa49..887f453a9 100644 --- a/pkg/state/pretty/pretty.go +++ b/pkg/state/pretty/pretty.go @@ -26,12 +26,17 @@ import ( "gvisor.dev/gvisor/pkg/state/wire" ) -func formatRef(x *wire.Ref, graph uint64, html bool) string { +type printer struct { + html bool + typeSpecs map[string]*wire.Type +} + +func (p *printer) formatRef(x *wire.Ref, graph uint64) string { baseRef := fmt.Sprintf("g%dr%d", graph, x.Root) fullRef := baseRef if len(x.Dots) > 0 { // See wire.Ref; Type valid if Dots non-zero. - typ, _ := formatType(x.Type, graph, html) + typ, _ := p.formatType(x.Type, graph) var buf strings.Builder buf.WriteString("(*") buf.WriteString(typ) @@ -51,34 +56,40 @@ func formatRef(x *wire.Ref, graph uint64, html bool) string { buf.WriteString(")") fullRef = buf.String() } - if html { + if p.html { return fmt.Sprintf("<a href=\"#%s\">%s</a>", baseRef, fullRef) } return fullRef } -func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) { +func (p *printer) formatType(t wire.TypeSpec, graph uint64) (string, bool) { switch x := t.(type) { case wire.TypeID: - base := fmt.Sprintf("g%dt%d", graph, x) - if html { - return fmt.Sprintf("<a href=\"#%s\">%s</a>", base, base), true + tag := fmt.Sprintf("g%dt%d", graph, x) + desc := tag + if spec, ok := p.typeSpecs[tag]; ok { + desc += fmt.Sprintf("=%s", spec.Name) + } else { + desc += "!missing-type-spec" + } + if p.html { + return fmt.Sprintf("<a href=\"#%s\">%s</a>", tag, desc), true } - return fmt.Sprintf("%s", base), true + return desc, true case wire.TypeSpecNil: return "", false // Only nil type. case *wire.TypeSpecPointer: - element, _ := formatType(x.Type, graph, html) + element, _ := p.formatType(x.Type, graph) return fmt.Sprintf("(*%s)", element), true case *wire.TypeSpecArray: - element, _ := formatType(x.Type, graph, html) + element, _ := p.formatType(x.Type, graph) return fmt.Sprintf("[%d](%s)", x.Count, element), true case *wire.TypeSpecSlice: - element, _ := formatType(x.Type, graph, html) + element, _ := p.formatType(x.Type, graph) return fmt.Sprintf("([]%s)", element), true case *wire.TypeSpecMap: - key, _ := formatType(x.Key, graph, html) - value, _ := formatType(x.Value, graph, html) + key, _ := p.formatType(x.Key, graph) + value, _ := p.formatType(x.Value, graph) return fmt.Sprintf("(map[%s]%s)", key, value), true default: panic(fmt.Sprintf("unreachable: unknown type %T", t)) @@ -87,7 +98,7 @@ func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) { // format formats a single object, for pretty-printing. It also returns whether // the value is a non-zero value. 
-func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bool) { +func (p *printer) format(graph uint64, depth int, encoded wire.Object) (string, bool) { switch x := encoded.(type) { case wire.Nil: return "nil", false @@ -98,7 +109,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo case *wire.Complex128: return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0 case *wire.Ref: - return formatRef(x, graph, html), x.Root != 0 + return p.formatRef(x, graph), x.Root != 0 case *wire.Type: tabs := "\n" + strings.Repeat("\t", depth) items := make([]string, 0, len(x.Fields)+2) @@ -109,7 +120,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo items = append(items, "}") return strings.Join(items, tabs), true // No zero value. case *wire.Slice: - return fmt.Sprintf("%s{len:%d,cap:%d}", formatRef(&x.Ref, graph, html), x.Length, x.Capacity), x.Capacity != 0 + return fmt.Sprintf("%s{len:%d,cap:%d}", p.formatRef(&x.Ref, graph), x.Length, x.Capacity), x.Capacity != 0 case *wire.Array: if len(x.Contents) == 0 { return "[]", false @@ -119,7 +130,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo items = append(items, "[") tabs := "\n" + strings.Repeat("\t", depth) for i := 0; i < len(x.Contents); i++ { - item, ok := format(graph, depth+1, x.Contents[i], html) + item, ok := p.format(graph, depth+1, x.Contents[i]) if !ok { zeros = append(zeros, fmt.Sprintf("\t%s,", item)) continue @@ -136,7 +147,9 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo items = append(items, "]") return strings.Join(items, tabs), len(zeros) < len(x.Contents) case *wire.Struct: - typ, _ := formatType(x.TypeID, graph, html) + tag := fmt.Sprintf("g%dt%d", graph, x.TypeID) + spec, _ := p.typeSpecs[tag] + typ, _ := p.formatType(x.TypeID, graph) if x.Fields() == 0 { return fmt.Sprintf("struct[%s]{}", typ), false } @@ -145,10 +158,15 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo tabs := "\n" + strings.Repeat("\t", depth) allZero := true for i := 0; i < x.Fields(); i++ { - element, ok := format(graph, depth+1, *x.Field(i), html) + var name string + if spec != nil && i < len(spec.Fields) { + name = spec.Fields[i] + } else { + name = fmt.Sprintf("%d", i) + } + element, ok := p.format(graph, depth+1, *x.Field(i)) allZero = allZero && !ok - items = append(items, fmt.Sprintf("\t%d: %s,", i, element)) - i++ + items = append(items, fmt.Sprintf("\t%s: %s,", name, element)) } items = append(items, "}") return strings.Join(items, tabs), !allZero @@ -160,15 +178,15 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo items = append(items, "map{") tabs := "\n" + strings.Repeat("\t", depth) for i := 0; i < len(x.Keys); i++ { - key, _ := format(graph, depth+1, x.Keys[i], html) - value, _ := format(graph, depth+1, x.Values[i], html) + key, _ := p.format(graph, depth+1, x.Keys[i]) + value, _ := p.format(graph, depth+1, x.Values[i]) items = append(items, fmt.Sprintf("\t%s: %s,", key, value)) } items = append(items, "}") return strings.Join(items, tabs), true case *wire.Interface: - typ, typOk := formatType(x.Type, graph, html) - element, elementOk := format(graph, depth+1, x.Value, html) + typ, typOk := p.formatType(x.Type, graph) + element, elementOk := p.format(graph, depth+1, x.Value) return fmt.Sprintf("interface[%s]{%s}", typ, element), typOk || elementOk default: // Must be a primitive; use reflection. 
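The main effect of threading the printer struct through format is the typeSpecs map: type definitions seen earlier in the stream let later struct nodes print field names instead of bare indices. A small standalone sketch of that lookup, where typeSpec stands in for wire.Type:

package main

import "fmt"

// typeSpec is a local stand-in for wire.Type: a type name plus its
// field names, recorded under the "g<graph>t<id>" tag when first seen.
type typeSpec struct {
	Name   string
	Fields []string
}

type printer struct {
	typeSpecs map[string]*typeSpec
}

// fieldName resolves field i of the type tagged tag to a name, falling
// back to the old numeric form when the type spec is missing.
func (p *printer) fieldName(tag string, i int) string {
	if spec, ok := p.typeSpecs[tag]; ok && i < len(spec.Fields) {
		return spec.Fields[i]
	}
	return fmt.Sprintf("%d", i)
}

func main() {
	p := &printer{typeSpecs: map[string]*typeSpec{
		"g1t1": {Name: "vfs.PathOperation", Fields: []string{"Root", "Start", "Path"}},
	}}
	fmt.Println(p.fieldName("g1t1", 2)) // Path
	fmt.Println(p.fieldName("g1t9", 2)) // 2 (missing type spec)
}

The numeric fallback keeps the printer usable on streams whose type objects have not been seen yet, which is also why formatType above prints !missing-type-spec in the corresponding case.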
@@ -177,11 +195,11 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo } // printStream is the basic print implementation. -func printStream(w io.Writer, r wire.Reader, html bool) (err error) { +func (p *printer) printStream(w io.Writer, r wire.Reader) (err error) { // current graph ID. var graph uint64 - if html { + if p.html { fmt.Fprintf(w, "<pre>") defer fmt.Fprintf(w, "</pre>") } @@ -196,6 +214,8 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) { } }() + p.typeSpecs = make(map[string]*wire.Type) + for { // Find the first object to begin generation. length, object, err := state.ReadHeader(r) @@ -223,18 +243,19 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) { // loop in decode.go. But we don't register type information, // etc. and just print the raw structures. var ( - oid uint64 = 1 - tid uint64 = 1 + tid uint64 = 1 + objects []wire.Object ) - for oid <= length { + for oid := uint64(1); oid <= length; { // Unmarshal the object. encoded := wire.Load(r) // Is this a type? - if _, ok := encoded.(*wire.Type); ok { - str, _ := format(graph, 0, encoded, html) + if typ, ok := encoded.(*wire.Type); ok { + str, _ := p.format(graph, 0, encoded) tag := fmt.Sprintf("g%dt%d", graph, tid) - if html { + p.typeSpecs[tag] = typ + if p.html { // See below. tag = fmt.Sprintf("<a name=\"%s\">%s</a><a href=\"#%s\">⚓</a>", tag, tag, tag) } @@ -245,17 +266,24 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) { continue } + // Otherwise, it is a node. + objects = append(objects, encoded) + oid++ + } + + for i, encoded := range objects { + // oid starts at 1. + oid := i + 1 // Format the node. - str, _ := format(graph, 0, encoded, html) + str, _ := p.format(graph, 0, encoded) tag := fmt.Sprintf("g%dr%d", graph, oid) - if html { + if p.html { // Create a little tag with an anchor next to it for linking. tag = fmt.Sprintf("<a name=\"%s\">%s</a><a href=\"#%s\">⚓</a>", tag, tag, tag) } if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil { return err } - oid++ } } @@ -264,10 +292,10 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) { // PrintText reads the stream from r and prints text to w. func PrintText(w io.Writer, r wire.Reader) error { - return printStream(w, r, false /* html */) + return (&printer{}).printStream(w, r) } // PrintHTML reads the stream from r and prints html to w. func PrintHTML(w io.Writer, r wire.Reader) error { - return printStream(w, r, true /* html */) + return (&printer{html: true}).printStream(w, r) } diff --git a/pkg/state/tests/load_test.go b/pkg/state/tests/load_test.go index 1e9794296..3c73ac391 100644 --- a/pkg/state/tests/load_test.go +++ b/pkg/state/tests/load_test.go @@ -20,6 +20,14 @@ import ( func TestLoadHooks(t *testing.T) { runTestCases(t, false, "load-hooks", []interface{}{ + // Root object being a struct. + afterLoadStruct{v: 1}, + valueLoadStruct{v: 1}, + genericContainer{v: &afterLoadStruct{v: 1}}, + genericContainer{v: &valueLoadStruct{v: 1}}, + sliceContainer{v: []interface{}{&afterLoadStruct{v: 1}}}, + sliceContainer{v: []interface{}{&valueLoadStruct{v: 1}}}, + // Root object being a pointer. 
&afterLoadStruct{v: 1}, &valueLoadStruct{v: 1}, &genericContainer{v: &afterLoadStruct{v: 1}}, diff --git a/pkg/state/types.go b/pkg/state/types.go index 215ef80f8..84aed8732 100644 --- a/pkg/state/types.go +++ b/pkg/state/types.go @@ -107,6 +107,14 @@ func lookupNameFields(typ reflect.Type) (string, []string, bool) { } return name, nil, true } + // Sanity check the type. + if raceEnabled { + if _, ok := reverseTypeDatabase[typ]; !ok { + // The type was not registered? Must be an embedded + // structure or something else. + return "", nil, false + } + } // Extract the name from the object. name := t.StateTypeName() fields := t.StateFields() @@ -313,6 +321,9 @@ var primitiveTypeDatabase = func() map[string]reflect.Type { // globalTypeDatabase is used for dispatching interfaces on decode. var globalTypeDatabase = map[string]reflect.Type{} +// reverseTypeDatabase is a reverse mapping. +var reverseTypeDatabase = map[reflect.Type]string{} + // Register registers a type. // // This must be called on init and only done once. @@ -358,4 +369,7 @@ func Register(t Type) { Failf("conflicting name for %T: matches interfaceType", t) } globalTypeDatabase[name] = typ + if raceEnabled { + reverseTypeDatabase[typ] = name + } } diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 4d47207f7..68535c3b1 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -38,6 +38,7 @@ go_library( "race_unsafe.go", "rwmutex_unsafe.go", "seqcount.go", + "spin_unsafe.go", "sync.go", ], marshal = False, diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go index 1d7780695..f5e630009 100644 --- a/pkg/sync/memmove_unsafe.go +++ b/pkg/sync/memmove_unsafe.go @@ -4,7 +4,7 @@ // license that can be found in the LICENSE file. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go index dc034d561..f4c2e9642 100644 --- a/pkg/sync/mutex_unsafe.go +++ b/pkg/sync/mutex_unsafe.go @@ -4,7 +4,7 @@ // license that can be found in the LICENSE file. // +build go1.13 -// +build !go1.16 +// +build !go1.17 // When updating the build constraint (above), check that syncMutex matches the // standard library sync.Mutex definition. diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go index 995c0346e..b3b4dee78 100644 --- a/pkg/sync/rwmutex_unsafe.go +++ b/pkg/sync/rwmutex_unsafe.go @@ -4,7 +4,7 @@ // license that can be found in the LICENSE file. // +build go1.13 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go index eda6fb131..2184cb5ab 100644 --- a/pkg/sync/seqatomic_unsafe.go +++ b/pkg/sync/seqatomic_unsafe.go @@ -25,41 +25,35 @@ import ( type Value struct{} // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race -// with any writer critical sections in sc. -func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value { - // This function doesn't use SeqAtomicTryLoad because doing so is - // measurably, significantly (~20%) slower; Go is awful at inlining. - var val Value +// with any writer critical sections in seq. +// +//go:nosplit +func SeqAtomicLoad(seq *sync.SeqCount, ptr *Value) Value { for { - epoch := sc.BeginRead() - if sync.RaceEnabled { - // runtime.RaceDisable() doesn't actually stop the race detector, - // so it can't help us here. Instead, call runtime.memmove - // directly, which is not instrumented by the race detector. 
- sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - // This is ~40% faster for short reads than going through memmove. - val = *ptr - } - if sc.ReadOk(epoch) { - break + if val, ok := SeqAtomicTryLoad(seq, seq.BeginRead(), ptr); ok { + return val } } - return val } // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section -// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read -// would race with a writer critical section, SeqAtomicTryLoad returns +// in seq initiated by a call to seq.BeginRead() that returned epoch. If the +// read would race with a writer critical section, SeqAtomicTryLoad returns // (unspecified, false). -func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) { - var val Value +// +//go:nosplit +func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (val Value, ok bool) { if sync.RaceEnabled { + // runtime.RaceDisable() doesn't actually stop the race detector, so it + // can't help us here. Instead, call runtime.memmove directly, which is + // not instrumented by the race detector. sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) } else { + // This is ~40% faster for short reads than going through memmove. val = *ptr } - return val, sc.ReadOk(epoch) + ok = seq.ReadOk(epoch) + return } func init() { diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go index a1e895352..2c5d3df99 100644 --- a/pkg/sync/seqcount.go +++ b/pkg/sync/seqcount.go @@ -8,7 +8,6 @@ package sync import ( "fmt" "reflect" - "runtime" "sync/atomic" ) @@ -43,9 +42,7 @@ type SeqCount struct { } // SeqCountEpoch tracks writer critical sections in a SeqCount. -type SeqCountEpoch struct { - val uint32 -} +type SeqCountEpoch uint32 // We assume that: // @@ -83,12 +80,25 @@ type SeqCountEpoch struct { // using this pattern. Most users of SeqCount will need to use the // SeqAtomicLoad function template in seqatomic.go. func (s *SeqCount) BeginRead() SeqCountEpoch { - epoch := atomic.LoadUint32(&s.epoch) - for epoch&1 != 0 { - runtime.Gosched() - epoch = atomic.LoadUint32(&s.epoch) + if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 { + return SeqCountEpoch(epoch) + } + return s.beginReadSlow() +} + +func (s *SeqCount) beginReadSlow() SeqCountEpoch { + i := 0 + for { + if canSpin(i) { + i++ + doSpin() + } else { + goyield() + } + if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 { + return SeqCountEpoch(epoch) + } } - return SeqCountEpoch{epoch} } // ReadOk returns true if the reader critical section initiated by a previous @@ -99,7 +109,7 @@ func (s *SeqCount) BeginRead() SeqCountEpoch { // Reader critical sections do not need to be explicitly terminated; the last // call to ReadOk is implicitly the end of the reader critical section. func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { - return atomic.LoadUint32(&s.epoch) == epoch.val + return atomic.LoadUint32(&s.epoch) == uint32(epoch) } // BeginWrite indicates the beginning of a writer critical section. diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/spin_unsafe.go new file mode 100644 index 000000000..cafb2d065 --- /dev/null +++ b/pkg/sync/spin_unsafe.go @@ -0,0 +1,24 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 +// +build !go1.17 + +// Check go:linkname function signatures when updating Go version. 
+ +package sync + +import ( + _ "unsafe" // for go:linkname +) + +//go:linkname canSpin sync.runtime_canSpin +func canSpin(i int) bool + +//go:linkname doSpin sync.runtime_doSpin +func doSpin() + +//go:linkname goyield runtime.goyield +func goyield() diff --git a/pkg/syncevent/broadcaster.go b/pkg/syncevent/broadcaster.go index 4bff59e7d..dabf08895 100644 --- a/pkg/syncevent/broadcaster.go +++ b/pkg/syncevent/broadcaster.go @@ -111,7 +111,9 @@ func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID { return id } -// Preconditions: table must not be full. len(table) is a power of 2. +// Preconditions: +// * table must not be full. +// * len(table) is a power of 2. func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) { entry := broadcasterSlot{ receiver: r, diff --git a/pkg/syncevent/source.go b/pkg/syncevent/source.go index ddffb171a..d3d0f34c5 100644 --- a/pkg/syncevent/source.go +++ b/pkg/syncevent/source.go @@ -19,9 +19,11 @@ type Source interface { // SubscribeEvents causes the Source to notify the given Receiver of the // given subset of events. // - // Preconditions: r != nil. The ReceiverCallback for r must not take locks - // that are ordered prior to the Source; for example, it cannot call any - // Source methods. + // Preconditions: + // * r != nil. + // * The ReceiverCallback for r must not take locks that are ordered + // prior to the Source; for example, it cannot call any Source + // methods. SubscribeEvents(r *Receiver, filter Set) SubscriptionID // UnsubscribeEvents causes the Source to stop notifying the Receiver diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go index ad271e1a0..518f18479 100644 --- a/pkg/syncevent/waiter_unsafe.go +++ b/pkg/syncevent/waiter_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.11 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. 
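The BeginRead fast path above now returns without any function call when no writer is active, and the slow path leans on the runtime's spinning heuristics through the go:linkname hooks in spin_unsafe.go. A self-contained sketch of the same read protocol, substituting runtime.Gosched for the unexported canSpin/doSpin/goyield hooks:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// seqCount is a standalone stand-in for sync.SeqCount. The epoch is odd
// exactly while a writer is inside its critical section.
type seqCount struct{ epoch uint32 }

// beginRead spins until no writer is active and returns the epoch.
func (s *seqCount) beginRead() uint32 {
	for {
		if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
			return epoch
		}
		runtime.Gosched() // the real slow path uses canSpin/doSpin/goyield
	}
}

// readOk reports whether the snapshot taken since beginRead is consistent.
func (s *seqCount) readOk(epoch uint32) bool {
	return atomic.LoadUint32(&s.epoch) == epoch
}

func main() {
	var s seqCount
	var data int
	// Writer critical section: bump the epoch to odd, write, bump to even.
	atomic.AddUint32(&s.epoch, 1)
	data = 42
	atomic.AddUint32(&s.epoch, 1)
	// Reader: retry until a consistent snapshot is observed.
	for {
		epoch := s.beginRead()
		v := data
		if s.readOk(epoch) {
			fmt.Println(v)
			break
		}
	}
}

The invariant is the usual seqlock one: a reader's snapshot is consistent if and only if the epoch was even when the read began and unchanged when it ended.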
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index fe9f50169..f516c8e46 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -33,6 +33,7 @@ var ( EBADFD = error(syscall.EBADFD) EBUSY = error(syscall.EBUSY) ECHILD = error(syscall.ECHILD) + ECONNABORTED = error(syscall.ECONNABORTED) ECONNREFUSED = error(syscall.ECONNREFUSED) ECONNRESET = error(syscall.ECONNRESET) EDEADLK = error(syscall.EDEADLK) diff --git a/pkg/syserror/syserror_test.go b/pkg/syserror/syserror_test.go index 29719752e..7036467c4 100644 --- a/pkg/syserror/syserror_test.go +++ b/pkg/syserror/syserror_test.go @@ -24,27 +24,20 @@ import ( var globalError error -func returnErrnoAsError() error { - return syscall.EINVAL -} - -func returnError() error { - return syserror.EINVAL -} - -func BenchmarkReturnErrnoAsError(b *testing.B) { +func BenchmarkAssignErrno(b *testing.B) { for i := b.N; i > 0; i-- { - returnErrnoAsError() + globalError = syscall.EINVAL } } -func BenchmarkReturnError(b *testing.B) { +func BenchmarkAssignError(b *testing.B) { for i := b.N; i > 0; i-- { - returnError() + globalError = syserror.EINVAL } } func BenchmarkCompareErrno(b *testing.B) { + globalError = syscall.EAGAIN j := 0 for i := b.N; i > 0; i-- { if globalError == syscall.EINVAL { @@ -54,6 +47,7 @@ func BenchmarkCompareErrno(b *testing.B) { } func BenchmarkCompareError(b *testing.B) { + globalError = syserror.EAGAIN j := 0 for i := b.N; i > 0; i-- { if globalError == syserror.EINVAL { @@ -63,6 +57,7 @@ func BenchmarkCompareError(b *testing.B) { } func BenchmarkSwitchErrno(b *testing.B) { + globalError = syscall.EPERM j := 0 for i := b.N; i > 0; i-- { switch globalError { @@ -77,6 +72,7 @@ func BenchmarkSwitchErrno(b *testing.B) { } func BenchmarkSwitchError(b *testing.B) { + globalError = syserror.EPERM j := 0 for i := b.N; i > 0; i-- { switch globalError { diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index d82ed5205..4f551cd92 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -245,7 +245,7 @@ func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn { // Accept implements net.Conn.Accept. func (l *TCPListener) Accept() (net.Conn, error) { - n, wq, err := l.ep.Accept() + n, wq, err := l.ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Create wait queue entry that notifies a channel. @@ -254,7 +254,7 @@ func (l *TCPListener) Accept() (net.Conn, error) { defer l.wq.EventUnregister(&waitEntry) for { - n, wq, err = l.ep.Accept() + n, wq, err = l.ep.Accept(nil) if err != tcpip.ErrWouldBlock { break @@ -541,7 +541,7 @@ func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, case <-notifyCh: } - err = ep.GetSockOpt(tcpip.ErrorOption{}) + err = ep.LastError() } if err != nil { ep.Close() diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 3c552988a..b196324c7 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -61,8 +61,8 @@ func TestTimeouts(t *testing.T) { func newLoopbackStack() (*stack.Stack, *tcpip.Error) { // Create the stack and add a NIC. 
s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol}, }) if err := s.CreateNIC(NICID, loopback.New()); err != nil { @@ -97,6 +97,9 @@ type testConnection struct { func connect(s *stack.Stack, addr tcpip.FullAddress) (*testConnection, *tcpip.Error) { wq := &waiter.Queue{} ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) + if err != nil { + return nil, err + } entry, ch := waiter.NewChannelEntry(nil) wq.EventRegister(&entry, waiter.EventOut) @@ -104,7 +107,7 @@ func connect(s *stack.Stack, addr tcpip.FullAddress) (*testConnection, *tcpip.Er err = ep.Connect(addr) if err == tcpip.ErrConnectStarted { <-ch - err = ep.GetSockOpt(tcpip.ErrorOption{}) + err = ep.LastError() } if err != nil { return nil, err @@ -145,7 +148,9 @@ func TestCloseReader(t *testing.T) { defer close(done) c, err := l.Accept() if err != nil { - t.Fatalf("l.Accept() = %v", err) + t.Errorf("l.Accept() = %v", err) + // Cannot call Fatalf in goroutine. Just return from the goroutine. + return } // Give c.Read() a chance to block before closing the connection. @@ -416,7 +421,9 @@ func TestDeadlineChange(t *testing.T) { defer close(done) c, err := l.Accept() if err != nil { - t.Fatalf("l.Accept() = %v", err) + t.Errorf("l.Accept() = %v", err) + // Cannot call Fatalf in goroutine. Just return from the goroutine. + return } c.SetDeadline(time.Now().Add(time.Minute)) diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index 563bc78ea..c326fab54 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -14,6 +14,8 @@ go_library( go_test( name = "buffer_test", size = "small", - srcs = ["view_test.go"], + srcs = [ + "view_test.go", + ], library = ":buffer", ) diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index ea0c5413d..8db70a700 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -84,8 +84,8 @@ type VectorisedView struct { size int } -// NewVectorisedView creates a new vectorised view from an already-allocated slice -// of View and sets its size. +// NewVectorisedView creates a new vectorised view from an already-allocated +// slice of View and sets its size. func NewVectorisedView(size int, views []View) VectorisedView { return VectorisedView{views: views, size: size} } @@ -170,8 +170,9 @@ func (vv *VectorisedView) CapLength(length int) { } // Clone returns a clone of this VectorisedView. -// If the buffer argument is large enough to contain all the Views of this VectorisedView, -// the method will avoid allocations and use the buffer to store the Views of the clone. +// If the buffer argument is large enough to contain all the Views of this +// VectorisedView, the method will avoid allocations and use the buffer to +// store the Views of the clone. func (vv *VectorisedView) Clone(buffer []View) VectorisedView { return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size} } @@ -209,7 +210,8 @@ func (vv *VectorisedView) PullUp(count int) (View, bool) { return newFirst, true } -// Size returns the size in bytes of the entire content stored in the vectorised view. +// Size returns the size in bytes of the entire content stored in the +// vectorised view. 
 func (vv *VectorisedView) Size() int {
 	return vv.size
 }
@@ -222,6 +224,12 @@ func (vv *VectorisedView) ToView() View {
 	if len(vv.views) == 1 {
 		return vv.views[0]
 	}
+	return vv.ToOwnedView()
+}
+
+// ToOwnedView returns a single view containing a copy of the content of the
+// vectorised view; the returned view is owned by the caller, not by vv.
+func (vv *VectorisedView) ToOwnedView() View {
 	u := make([]byte, 0, vv.size)
 	for _, v := range vv.views {
 		u = append(u, v...)
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index b769094dc..6f81b0164 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -118,18 +118,100 @@ func TTL(ttl uint8) NetworkChecker {
 			v = ip.HopLimit()
 		}
 		if v != ttl {
-			t.Fatalf("Bad TTL, got %v, want %v", v, ttl)
+			t.Fatalf("Bad TTL, got = %d, want = %d", v, ttl)
+		}
+	}
+}
+
+// IPFullLength creates a checker for the full IP packet length. The
+// expected size is checked against both the Total Length in the
+// header and the number of bytes received.
+func IPFullLength(packetLength uint16) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		var v uint16
+		var l uint16
+		switch ip := h[0].(type) {
+		case header.IPv4:
+			v = ip.TotalLength()
+			l = uint16(len(ip))
+		case header.IPv6:
+			v = ip.PayloadLength() + header.IPv6FixedHeaderSize
+			l = uint16(len(ip))
+		default:
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4 or header.IPv6", ip)
+		}
+		if l != packetLength {
+			t.Errorf("bad packet length, got = %d, want = %d", l, packetLength)
+		}
+		if v != packetLength {
+			t.Errorf("unexpected packet length in header, got = %d, want = %d", v, packetLength)
+		}
+	}
+}
+
+// IPv4HeaderLength creates a checker that checks the IPv4 Header length.
+func IPv4HeaderLength(headerLength int) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		switch ip := h[0].(type) {
+		case header.IPv4:
+			if hl := ip.HeaderLength(); hl != uint8(headerLength) {
+				t.Errorf("Bad header length, got = %d, want = %d", hl, headerLength)
+			}
+		default:
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4", ip)
 		}
 	}
 }
 
 // PayloadLen creates a checker that checks the payload length.
-func PayloadLen(plen int) NetworkChecker {
+func PayloadLen(payloadLength int) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
 
-		if l := len(h[0].Payload()); l != plen {
-			t.Errorf("Bad payload length, got %v, want %v", l, plen)
+		if l := len(h[0].Payload()); l != payloadLength {
+			t.Errorf("Bad payload length, got = %d, want = %d", l, payloadLength)
+		}
+	}
+}
+
+// IPPayload creates a checker that checks the payload.
+func IPPayload(payload []byte) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		got := h[0].Payload()
+
+		// cmp.Diff does not consider nil slices equal to empty slices, but we do.
+		if len(got) == 0 && len(payload) == 0 {
+			return
+		}
+
+		if diff := cmp.Diff(payload, got); diff != "" {
+			t.Errorf("payload mismatch (-want +got):\n%s", diff)
+		}
+	}
+}
+
+// IPv4Options returns a checker that checks the options in an IPv4 packet.
+func IPv4Options(want []byte) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		ip, ok := h[0].(header.IPv4)
+		if !ok {
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4", h[0])
+		}
+		options := ip.Options()
+		// cmp.Diff does not consider nil slices equal to empty slices, but we do.
+		if len(want) == 0 && len(options) == 0 {
+			return
+		}
+		if diff := cmp.Diff(want, options); diff != "" {
+			t.Errorf("options mismatch (-want +got):\n%s", diff)
 		}
 	}
 }
@@ -139,11 +221,11 @@ func FragmentOffset(offset uint16) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
 
-		// We only do this of IPv4 for now.
+		// We only do this for IPv4 for now.
 		switch ip := h[0].(type) {
 		case header.IPv4:
 			if v := ip.FragmentOffset(); v != offset {
-				t.Errorf("Bad fragment offset, got %v, want %v", v, offset)
+				t.Errorf("Bad fragment offset, got = %d, want = %d", v, offset)
 			}
 		}
 	}
@@ -154,11 +236,11 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
 
-		// We only do this of IPv4 for now.
+		// We only do this for IPv4 for now.
 		switch ip := h[0].(type) {
 		case header.IPv4:
 			if v := ip.Flags(); v != flags {
-				t.Errorf("Bad fragment offset, got %v, want %v", v, flags)
+				t.Errorf("Bad fragment flags, got = %d, want = %d", v, flags)
 			}
 		}
 	}
@@ -208,7 +290,7 @@ func TOS(tos uint8, label uint32) NetworkChecker {
 		t.Helper()
 
 		if v, l := h[0].TOS(); v != tos || l != label {
-			t.Errorf("Bad TOS, got (%v, %v), want (%v,%v)", v, l, tos, label)
+			t.Errorf("Bad TOS, got = (%d, %d), want = (%d, %d)", v, l, tos, label)
 		}
 	}
 }
@@ -234,7 +316,7 @@ func IPv6Fragment(checkers ...NetworkChecker) NetworkChecker {
 		t.Helper()
 
 		if p := h[0].TransportProtocol(); p != header.IPv6FragmentHeader {
-			t.Errorf("Bad protocol, got %v, want %v", p, header.UDPProtocolNumber)
+			t.Errorf("Bad protocol, got = %d, want = %d", p, header.IPv6FragmentHeader)
 		}
 
 		ipv6Frag := header.IPv6Fragment(h[0].Payload())
@@ -261,7 +343,7 @@ func TCP(checkers ...TransportChecker) NetworkChecker {
 		last := h[len(h)-1]
 
 		if p := last.TransportProtocol(); p != header.TCPProtocolNumber {
-			t.Errorf("Bad protocol, got %v, want %v", p, header.TCPProtocolNumber)
+			t.Errorf("Bad protocol, got = %d, want = %d", p, header.TCPProtocolNumber)
 		}
 
 		// Verify the checksum.
@@ -297,7 +379,7 @@ func UDP(checkers ...TransportChecker) NetworkChecker {
 		last := h[len(h)-1]
 
 		if p := last.TransportProtocol(); p != header.UDPProtocolNumber {
-			t.Errorf("Bad protocol, got %v, want %v", p, header.UDPProtocolNumber)
+			t.Errorf("Bad protocol, got = %d, want = %d", p, header.UDPProtocolNumber)
 		}
 
 		udp := header.UDP(last.Payload())
@@ -316,7 +398,7 @@ func SrcPort(port uint16) TransportChecker {
 		t.Helper()
 
 		if p := h.SourcePort(); p != port {
-			t.Errorf("Bad source port, got %v, want %v", p, port)
+			t.Errorf("Bad source port, got = %d, want = %d", p, port)
 		}
 	}
 }
@@ -327,7 +409,7 @@ func DstPort(port uint16) TransportChecker {
 		t.Helper()
 
 		if p := h.DestinationPort(); p != port {
-			t.Errorf("Bad destination port, got %v, want %v", p, port)
+			t.Errorf("Bad destination port, got = %d, want = %d", p, port)
 		}
 	}
 }
@@ -339,7 +421,7 @@ func NoChecksum(noChecksum bool) TransportChecker {
 
 		udp, ok := h.(header.UDP)
 		if !ok {
-			return
+			t.Fatalf("UDP header not found in h: %T", h)
 		}
 
 		if b := udp.Checksum() == 0; b != noChecksum {
@@ -348,50 +430,84 @@
 	}
 }
 
-// SeqNum creates a checker that checks the sequence number.
-func SeqNum(seq uint32) TransportChecker {
+// TCPSeqNum creates a checker that checks the sequence number.
+func TCPSeqNum(seq uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if s := tcp.SequenceNumber(); s != seq {
-			t.Errorf("Bad sequence number, got %v, want %v", s, seq)
+			t.Errorf("Bad sequence number, got = %d, want = %d", s, seq)
 		}
 	}
 }
 
-// AckNum creates a checker that checks the ack number.
-func AckNum(seq uint32) TransportChecker {
+// TCPAckNum creates a checker that checks the ack number.
+func TCPAckNum(seq uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if s := tcp.AckNumber(); s != seq {
-			t.Errorf("Bad ack number, got %v, want %v", s, seq)
+			t.Errorf("Bad ack number, got = %d, want = %d", s, seq)
 		}
 	}
 }
 
-// Window creates a checker that checks the tcp window.
-func Window(window uint16) TransportChecker {
+// TCPWindow creates a checker that checks the TCP window.
+func TCPWindow(window uint16) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if w := tcp.WindowSize(); w != window {
-			t.Errorf("Bad window, got 0x%x, want 0x%x", w, window)
+			t.Errorf("Bad window, got %d, want %d", w, window)
+		}
+	}
+}
+
+// TCPWindowGreaterThanEq creates a checker that checks that the TCP window
+// is greater than or equal to the provided value.
+func TCPWindowGreaterThanEq(window uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		tcp, ok := h.(header.TCP)
+		if !ok {
+			t.Fatalf("TCP header not found in h: %T", h)
+		}
+
+		if w := tcp.WindowSize(); w < window {
+			t.Errorf("Bad window, got %d, want >= %d", w, window)
+		}
+	}
+}
+
+// TCPWindowLessThanEq creates a checker that checks that the TCP window
+// is less than or equal to the provided value.
+func TCPWindowLessThanEq(window uint16) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + tcp, ok := h.(header.TCP) + if !ok { + t.Fatalf("TCP header not found in h: %T", h) + } + + if w := tcp.WindowSize(); w > window { + t.Errorf("Bad window, got %d, want < %d", w, window) } } } @@ -403,7 +519,7 @@ func TCPFlags(flags uint8) TransportChecker { tcp, ok := h.(header.TCP) if !ok { - return + t.Fatalf("TCP header not found in h: %T", h) } if f := tcp.Flags(); f != flags { @@ -420,7 +536,7 @@ func TCPFlagsMatch(flags, mask uint8) TransportChecker { tcp, ok := h.(header.TCP) if !ok { - return + t.Fatalf("TCP header not found in h: %T", h) } if f := tcp.Flags(); (f & mask) != (flags & mask) { @@ -458,7 +574,7 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker { case header.TCPOptionMSS: v := uint16(opts[i+2])<<8 | uint16(opts[i+3]) if wantOpts.MSS != v { - t.Errorf("Bad MSS: got %v, want %v", v, wantOpts.MSS) + t.Errorf("Bad MSS, got = %d, want = %d", v, wantOpts.MSS) } foundMSS = true i += 4 @@ -468,7 +584,7 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker { } v := int(opts[i+2]) if v != wantOpts.WS { - t.Errorf("Bad WS: got %v, want %v", v, wantOpts.WS) + t.Errorf("Bad WS, got = %d, want = %d", v, wantOpts.WS) } foundWS = true i += 3 @@ -517,7 +633,7 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker { t.Error("TS option specified but the timestamp value is zero") } if foundTS && tsEcr == 0 && wantOpts.TSEcr != 0 { - t.Errorf("TS option specified but TSEcr is incorrect: got %d, want: %d", tsEcr, wantOpts.TSEcr) + t.Errorf("TS option specified but TSEcr is incorrect, got = %d, want = %d", tsEcr, wantOpts.TSEcr) } if wantOpts.SACKPermitted && !foundSACKPermitted { t.Errorf("SACKPermitted option not found. Options: %x", opts) @@ -555,7 +671,7 @@ func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) Transp t.Errorf("TS option found, but option is truncated, option length: %d, want 10 bytes", limit-i) } if opts[i+1] != 10 { - t.Errorf("TS option found, but bad length specified: %d, want: 10", opts[i+1]) + t.Errorf("TS option found, but bad length specified: got = %d, want = 10", opts[i+1]) } tsVal = binary.BigEndian.Uint32(opts[i+2:]) tsEcr = binary.BigEndian.Uint32(opts[i+6:]) @@ -575,19 +691,19 @@ func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) Transp } if wantTS != foundTS { - t.Errorf("TS Option mismatch: got TS= %v, want TS= %v", foundTS, wantTS) + t.Errorf("TS Option mismatch, got TS= %t, want TS= %t", foundTS, wantTS) } if wantTS && wantTSVal != 0 && wantTSVal != tsVal { - t.Errorf("Timestamp value is incorrect: got: %d, want: %d", tsVal, wantTSVal) + t.Errorf("Timestamp value is incorrect, got = %d, want = %d", tsVal, wantTSVal) } if wantTS && wantTSEcr != 0 && tsEcr != wantTSEcr { - t.Errorf("Timestamp Echo Reply is incorrect: got: %d, want: %d", tsEcr, wantTSEcr) + t.Errorf("Timestamp Echo Reply is incorrect, got = %d, want = %d", tsEcr, wantTSEcr) } } } -// TCPNoSACKBlockChecker creates a checker that verifies that the segment does not -// contain any SACK blocks in the TCP options. +// TCPNoSACKBlockChecker creates a checker that verifies that the segment does +// not contain any SACK blocks in the TCP options. 
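
TCPSynOptions above decodes the MSS option by joining two network-order bytes. That arithmetic, shown as a self-contained sketch (the sample option bytes are assumptions):

package main

import "fmt"

// decodeMSS mirrors the arithmetic in TCPSynOptions above: the MSS value
// is two bytes in network (big-endian) order following the option's kind
// and length bytes.
func decodeMSS(opts []byte, i int) uint16 {
	return uint16(opts[i+2])<<8 | uint16(opts[i+3])
}

func main() {
	// Kind=2 (MSS), Length=4, value 0x05b4 = 1460.
	opts := []byte{2, 4, 0x05, 0xb4}
	fmt.Println(decodeMSS(opts, 0)) // prints 1460
}
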
func TCPNoSACKBlockChecker() TransportChecker { return TCPSACKBlockChecker(nil) } @@ -645,7 +761,7 @@ func TCPSACKBlockChecker(sackBlocks []header.SACKBlock) TransportChecker { } if !reflect.DeepEqual(gotSACKBlocks, sackBlocks) { - t.Errorf("SACKBlocks are not equal, got: %v, want: %v", gotSACKBlocks, sackBlocks) + t.Errorf("SACKBlocks are not equal, got = %v, want = %v", gotSACKBlocks, sackBlocks) } } } @@ -661,8 +777,8 @@ func Payload(want []byte) TransportChecker { } } -// ICMPv4 creates a checker that checks that the transport protocol is ICMPv4 and -// potentially additional ICMPv4 header fields. +// ICMPv4 creates a checker that checks that the transport protocol is ICMPv4 +// and potentially additional ICMPv4 header fields. func ICMPv4(checkers ...TransportChecker) NetworkChecker { return func(t *testing.T, h []header.Network) { t.Helper() @@ -690,10 +806,10 @@ func ICMPv4Type(want header.ICMPv4Type) TransportChecker { icmpv4, ok := h.(header.ICMPv4) if !ok { - t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h) + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h) } if got := icmpv4.Type(); got != want { - t.Fatalf("unexpected icmp type got: %d, want: %d", got, want) + t.Fatalf("unexpected icmp type, got = %d, want = %d", got, want) } } } @@ -705,10 +821,76 @@ func ICMPv4Code(want header.ICMPv4Code) TransportChecker { icmpv4, ok := h.(header.ICMPv4) if !ok { - t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h) + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h) } if got := icmpv4.Code(); got != want { - t.Fatalf("unexpected ICMP code got: %d, want: %d", got, want) + t.Fatalf("unexpected ICMP code, got = %d, want = %d", got, want) + } + } +} + +// ICMPv4Ident creates a checker that checks the ICMPv4 echo Ident. +func ICMPv4Ident(want uint16) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + icmpv4, ok := h.(header.ICMPv4) + if !ok { + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h) + } + if got := icmpv4.Ident(); got != want { + t.Fatalf("unexpected ICMP ident, got = %d, want = %d", got, want) + } + } +} + +// ICMPv4Seq creates a checker that checks the ICMPv4 echo Sequence. +func ICMPv4Seq(want uint16) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + icmpv4, ok := h.(header.ICMPv4) + if !ok { + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h) + } + if got := icmpv4.Sequence(); got != want { + t.Fatalf("unexpected ICMP sequence, got = %d, want = %d", got, want) + } + } +} + +// ICMPv4Checksum creates a checker that checks the ICMPv4 Checksum. +// This assumes that the payload exactly makes up the rest of the slice. +func ICMPv4Checksum() TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + icmpv4, ok := h.(header.ICMPv4) + if !ok { + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h) + } + heldChecksum := icmpv4.Checksum() + icmpv4.SetChecksum(0) + newChecksum := ^header.Checksum(icmpv4, 0) + icmpv4.SetChecksum(heldChecksum) + if heldChecksum != newChecksum { + t.Errorf("unexpected ICMP checksum, got = %d, want = %d", heldChecksum, newChecksum) + } + } +} + +// ICMPv4Payload creates a checker that checks the payload in an ICMPv4 packet. 
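
ICMPv4Checksum above verifies a packet by zeroing the checksum field, recomputing, and comparing against the held value. The underlying RFC 1071 internet checksum is short enough to show standalone (a sketch; the sample echo bytes are assumptions):

package main

import "fmt"

// checksum computes the RFC 1071 internet checksum over buf: the 16-bit
// one's complement of the one's complement sum of all 16-bit words.
func checksum(buf []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(buf); i += 2 {
		sum += uint32(buf[i])<<8 | uint32(buf[i+1])
	}
	if len(buf)%2 == 1 {
		sum += uint32(buf[len(buf)-1]) << 8
	}
	for sum>>16 != 0 {
		sum = sum>>16 + sum&0xffff
	}
	return ^uint16(sum)
}

func main() {
	// An ICMPv4 echo request (type 8, code 0) with its checksum field
	// (bytes 2-3) zeroed, as ICMPv4Checksum does before recomputing.
	pkt := []byte{8, 0, 0x00, 0x00, 0x12, 0x34, 0x00, 0x01}
	fmt.Printf("checksum: %#04x\n", checksum(pkt))
}
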
+func ICMPv4Payload(want []byte) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + icmpv4, ok := h.(header.ICMPv4) + if !ok { + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h) + } + payload := icmpv4.Payload() + if diff := cmp.Diff(want, payload); diff != "" { + t.Errorf("ICMP payload mismatch (-want +got):\n%s", diff) } } } @@ -748,10 +930,10 @@ func ICMPv6Type(want header.ICMPv6Type) TransportChecker { icmpv6, ok := h.(header.ICMPv6) if !ok { - t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h) + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv6", h) } if got := icmpv6.Type(); got != want { - t.Fatalf("unexpected icmp type got: %d, want: %d", got, want) + t.Fatalf("unexpected icmp type, got = %d, want = %d", got, want) } } } @@ -763,10 +945,10 @@ func ICMPv6Code(want header.ICMPv6Code) TransportChecker { icmpv6, ok := h.(header.ICMPv6) if !ok { - t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h) + t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv6", h) } if got := icmpv6.Code(); got != want { - t.Fatalf("unexpected ICMP code got: %d, want: %d", got, want) + t.Fatalf("unexpected ICMP code, got = %d, want = %d", got, want) } } } diff --git a/pkg/tcpip/faketime/BUILD b/pkg/tcpip/faketime/BUILD new file mode 100644 index 000000000..114d43df3 --- /dev/null +++ b/pkg/tcpip/faketime/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "faketime", + srcs = ["faketime.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "@com_github_dpjacques_clockwork//:go_default_library", + ], +) + +go_test( + name = "faketime_test", + size = "small", + srcs = [ + "faketime_test.go", + ], + deps = [ + "//pkg/tcpip/faketime", + ], +) diff --git a/pkg/tcpip/stack/fake_time_test.go b/pkg/tcpip/faketime/faketime.go index 92c8cb534..f7a4fbde1 100644 --- a/pkg/tcpip/stack/fake_time_test.go +++ b/pkg/tcpip/faketime/faketime.go @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -package stack +// Package faketime provides a fake clock that implements tcpip.Clock interface. +package faketime import ( "container/heap" @@ -23,7 +24,29 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) -type fakeClock struct { +// NullClock implements a clock that never advances. +type NullClock struct{} + +var _ tcpip.Clock = (*NullClock)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (*NullClock) NowNanoseconds() int64 { + return 0 +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (*NullClock) NowMonotonic() int64 { + return 0 +} + +// AfterFunc implements tcpip.Clock.AfterFunc. +func (*NullClock) AfterFunc(time.Duration, func()) tcpip.Timer { + return nil +} + +// ManualClock implements tcpip.Clock and only advances manually with Advance +// method. +type ManualClock struct { clock clockwork.FakeClock // mu protects the fields below. @@ -39,34 +62,35 @@ type fakeClock struct { waitGroups map[time.Time]*sync.WaitGroup } -func newFakeClock() *fakeClock { - return &fakeClock{ +// NewManualClock creates a new ManualClock instance. 
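
The simpler NullClock above is useful wherever a component requires a tcpip.Clock but the test must guarantee that time never moves and no timer fires. A hypothetical injection sketch:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/faketime"
)

// report is a stand-in for any code that accepts a tcpip.Clock.
func report(c tcpip.Clock) {
	fmt.Println(c.NowNanoseconds(), c.NowMonotonic()) // always 0 0 for NullClock
}

func main() {
	report(&faketime.NullClock{})
}
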
+func NewManualClock() *ManualClock { + return &ManualClock{ clock: clockwork.NewFakeClock(), times: &timeHeap{}, waitGroups: make(map[time.Time]*sync.WaitGroup), } } -var _ tcpip.Clock = (*fakeClock)(nil) +var _ tcpip.Clock = (*ManualClock)(nil) // NowNanoseconds implements tcpip.Clock.NowNanoseconds. -func (fc *fakeClock) NowNanoseconds() int64 { - return fc.clock.Now().UnixNano() +func (mc *ManualClock) NowNanoseconds() int64 { + return mc.clock.Now().UnixNano() } // NowMonotonic implements tcpip.Clock.NowMonotonic. -func (fc *fakeClock) NowMonotonic() int64 { - return fc.NowNanoseconds() +func (mc *ManualClock) NowMonotonic() int64 { + return mc.NowNanoseconds() } // AfterFunc implements tcpip.Clock.AfterFunc. -func (fc *fakeClock) AfterFunc(d time.Duration, f func()) tcpip.Timer { - until := fc.clock.Now().Add(d) - wg := fc.addWait(until) - return &fakeTimer{ - clock: fc, +func (mc *ManualClock) AfterFunc(d time.Duration, f func()) tcpip.Timer { + until := mc.clock.Now().Add(d) + wg := mc.addWait(until) + return &manualTimer{ + clock: mc, until: until, - timer: fc.clock.AfterFunc(d, func() { + timer: mc.clock.AfterFunc(d, func() { defer wg.Done() f() }), @@ -75,110 +99,113 @@ func (fc *fakeClock) AfterFunc(d time.Duration, f func()) tcpip.Timer { // addWait adds an additional wait to the WaitGroup for parallel execution of // all work scheduled for t. Returns a reference to the WaitGroup modified. -func (fc *fakeClock) addWait(t time.Time) *sync.WaitGroup { - fc.mu.RLock() - wg, ok := fc.waitGroups[t] - fc.mu.RUnlock() +func (mc *ManualClock) addWait(t time.Time) *sync.WaitGroup { + mc.mu.RLock() + wg, ok := mc.waitGroups[t] + mc.mu.RUnlock() if ok { wg.Add(1) return wg } - fc.mu.Lock() - heap.Push(fc.times, t) - fc.mu.Unlock() + mc.mu.Lock() + heap.Push(mc.times, t) + mc.mu.Unlock() wg = &sync.WaitGroup{} wg.Add(1) - fc.mu.Lock() - fc.waitGroups[t] = wg - fc.mu.Unlock() + mc.mu.Lock() + mc.waitGroups[t] = wg + mc.mu.Unlock() return wg } // removeWait removes a wait from the WaitGroup for parallel execution of all // work scheduled for t. -func (fc *fakeClock) removeWait(t time.Time) { - fc.mu.RLock() - defer fc.mu.RUnlock() +func (mc *ManualClock) removeWait(t time.Time) { + mc.mu.RLock() + defer mc.mu.RUnlock() - wg := fc.waitGroups[t] + wg := mc.waitGroups[t] wg.Done() } -// advance executes all work that have been scheduled to execute within d from -// the current fake time. Blocks until all work has completed execution. -func (fc *fakeClock) advance(d time.Duration) { +// Advance executes all work that have been scheduled to execute within d from +// the current time. Blocks until all work has completed execution. 
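
Under the hood, addWait and removeWait above keep one WaitGroup per deadline so that every callback scheduled for the same instant can be awaited as a batch. The same pattern in isolation (a generic sketch, not this package's code):

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	// One WaitGroup per deadline, as in ManualClock.addWait.
	waits := make(map[time.Time]*sync.WaitGroup)
	deadline := time.Unix(0, 1000)

	wg, ok := waits[deadline]
	if !ok {
		wg = new(sync.WaitGroup)
		waits[deadline] = wg
	}
	wg.Add(1)
	go func() {
		defer wg.Done()
		fmt.Println("work scheduled for", deadline.UnixNano())
	}()

	wg.Wait() // Advance would block here until the batch drains.
	delete(waits, deadline)
}
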
+func (mc *ManualClock) Advance(d time.Duration) { // Block until all the work is done - until := fc.clock.Now().Add(d) + until := mc.clock.Now().Add(d) for { - fc.mu.Lock() - if fc.times.Len() == 0 { - fc.mu.Unlock() - return + mc.mu.Lock() + if mc.times.Len() == 0 { + mc.mu.Unlock() + break } - t := heap.Pop(fc.times).(time.Time) + t := heap.Pop(mc.times).(time.Time) if t.After(until) { // No work to do - heap.Push(fc.times, t) - fc.mu.Unlock() - return + heap.Push(mc.times, t) + mc.mu.Unlock() + break } - fc.mu.Unlock() + mc.mu.Unlock() - diff := t.Sub(fc.clock.Now()) - fc.clock.Advance(diff) + diff := t.Sub(mc.clock.Now()) + mc.clock.Advance(diff) - fc.mu.RLock() - wg := fc.waitGroups[t] - fc.mu.RUnlock() + mc.mu.RLock() + wg := mc.waitGroups[t] + mc.mu.RUnlock() wg.Wait() - fc.mu.Lock() - delete(fc.waitGroups, t) - fc.mu.Unlock() + mc.mu.Lock() + delete(mc.waitGroups, t) + mc.mu.Unlock() + } + if now := mc.clock.Now(); until.After(now) { + mc.clock.Advance(until.Sub(now)) } } -type fakeTimer struct { - clock *fakeClock +type manualTimer struct { + clock *ManualClock timer clockwork.Timer mu sync.RWMutex until time.Time } -var _ tcpip.Timer = (*fakeTimer)(nil) +var _ tcpip.Timer = (*manualTimer)(nil) // Reset implements tcpip.Timer.Reset. -func (ft *fakeTimer) Reset(d time.Duration) { - if !ft.timer.Reset(d) { +func (t *manualTimer) Reset(d time.Duration) { + if !t.timer.Reset(d) { return } - ft.mu.Lock() - defer ft.mu.Unlock() + t.mu.Lock() + defer t.mu.Unlock() - ft.clock.removeWait(ft.until) - ft.until = ft.clock.clock.Now().Add(d) - ft.clock.addWait(ft.until) + t.clock.removeWait(t.until) + t.until = t.clock.clock.Now().Add(d) + t.clock.addWait(t.until) } // Stop implements tcpip.Timer.Stop. -func (ft *fakeTimer) Stop() bool { - if !ft.timer.Stop() { +func (t *manualTimer) Stop() bool { + if !t.timer.Stop() { return false } - ft.mu.RLock() - defer ft.mu.RUnlock() + t.mu.RLock() + defer t.mu.RUnlock() - ft.clock.removeWait(ft.until) + t.clock.removeWait(t.until) return true } diff --git a/pkg/tcpip/faketime/faketime_test.go b/pkg/tcpip/faketime/faketime_test.go new file mode 100644 index 000000000..c2704df2c --- /dev/null +++ b/pkg/tcpip/faketime/faketime_test.go @@ -0,0 +1,95 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package faketime_test + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip/faketime" +) + +func TestManualClockAdvance(t *testing.T) { + const timeout = time.Millisecond + clock := faketime.NewManualClock() + start := clock.NowMonotonic() + clock.Advance(timeout) + if got, want := time.Duration(clock.NowMonotonic()-start)*time.Nanosecond, timeout; got != want { + t.Errorf("got = %d, want = %d", got, want) + } +} + +func TestManualClockAfterFunc(t *testing.T) { + const ( + timeout1 = time.Millisecond // timeout for counter1 + timeout2 = 2 * time.Millisecond // timeout for counter2 + ) + tests := []struct { + name string + advance time.Duration + wantCounter1 int + wantCounter2 int + }{ + { + name: "before timeout1", + advance: timeout1 - 1, + wantCounter1: 0, + wantCounter2: 0, + }, + { + name: "timeout1", + advance: timeout1, + wantCounter1: 1, + wantCounter2: 0, + }, + { + name: "timeout2", + advance: timeout2, + wantCounter1: 1, + wantCounter2: 1, + }, + { + name: "after timeout2", + advance: timeout2 + 1, + wantCounter1: 1, + wantCounter2: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + clock := faketime.NewManualClock() + counter1 := 0 + counter2 := 0 + clock.AfterFunc(timeout1, func() { + counter1++ + }) + clock.AfterFunc(timeout2, func() { + counter2++ + }) + start := clock.NowMonotonic() + clock.Advance(test.advance) + if got, want := counter1, test.wantCounter1; got != want { + t.Errorf("got counter1 = %d, want = %d", got, want) + } + if got, want := counter2, test.wantCounter2; got != want { + t.Errorf("got counter2 = %d, want = %d", got, want) + } + if got, want := time.Duration(clock.NowMonotonic()-start)*time.Nanosecond, test.advance; got != want { + t.Errorf("got elapsed = %d, want = %d", got, want) + } + }) + } +} diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index eaface8cb..95ade0e5c 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -117,25 +117,31 @@ func (b Ethernet) Encode(e *EthernetFields) { copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) } -// IsValidUnicastEthernetAddress returns true if addr is a valid unicast +// IsMulticastEthernetAddress returns true if the address is a multicast +// ethernet address. +func IsMulticastEthernetAddress(addr tcpip.LinkAddress) bool { + if len(addr) != EthernetAddressSize { + return false + } + + return addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 +} + +// IsValidUnicastEthernetAddress returns true if the address is a unicast // ethernet address. func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool { - // Must be of the right length. if len(addr) != EthernetAddressSize { return false } - // Must not be unspecified. if addr == unspecifiedEthernetAddress { return false } - // Must not be a multicast. if addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 { return false } - // addr is a valid unicast ethernet address. 
return true } diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go index 14413f2ce..3bc8b2b21 100644 --- a/pkg/tcpip/header/eth_test.go +++ b/pkg/tcpip/header/eth_test.go @@ -67,6 +67,53 @@ func TestIsValidUnicastEthernetAddress(t *testing.T) { } } +func TestIsMulticastEthernetAddress(t *testing.T) { + tests := []struct { + name string + addr tcpip.LinkAddress + expected bool + }{ + { + "Nil", + tcpip.LinkAddress([]byte(nil)), + false, + }, + { + "Empty", + tcpip.LinkAddress(""), + false, + }, + { + "InvalidLength", + tcpip.LinkAddress("\x01\x02\x03"), + false, + }, + { + "Unspecified", + unspecifiedEthernetAddress, + false, + }, + { + "Multicast", + tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"), + true, + }, + { + "Unicast", + tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06"), + false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + if got := IsMulticastEthernetAddress(test.addr); got != test.expected { + t.Fatalf("got IsMulticastEthernetAddress = %t, want = %t", got, test.expected) + } + }) + } +} + func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) { tests := []struct { name string diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index be03fb086..504408878 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -31,6 +31,27 @@ const ( // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. ICMPv4MinimumSize = 8 + // ICMPv4MinimumErrorPayloadSize Is the smallest number of bytes of an + // errant packet's transport layer that an ICMP error type packet should + // attempt to send as per RFC 792 (see each type) and RFC 1122 + // section 3.2.2 which states: + // Every ICMP error message includes the Internet header and at + // least the first 8 data octets of the datagram that triggered + // the error; more than 8 octets MAY be sent; this header and data + // MUST be unchanged from the received datagram. + // + // RFC 792 shows: + // 0 1 2 3 + // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Type | Code | Checksum | + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | unused | + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Internet Header + 64 bits of Original Data Datagram | + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ICMPv4MinimumErrorPayloadSize = 8 + // ICMPv4ProtocolNumber is the ICMP transport protocol number. ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1 @@ -39,15 +60,19 @@ const ( icmpv4ChecksumOffset = 2 // icmpv4MTUOffset is the offset of the MTU field - // in a ICMPv4FragmentationNeeded message. + // in an ICMPv4FragmentationNeeded message. icmpv4MTUOffset = 6 // icmpv4IdentOffset is the offset of the ident field - // in a ICMPv4EchoRequest/Reply message. + // in an ICMPv4EchoRequest/Reply message. icmpv4IdentOffset = 4 + // icmpv4PointerOffset is the offset of the pointer field + // in an ICMPv4ParamProblem message. + icmpv4PointerOffset = 4 + // icmpv4SequenceOffset is the offset of the sequence field - // in a ICMPv4EchoRequest/Reply message. + // in an ICMPv4EchoRequest/Reply message. icmpv4SequenceOffset = 6 ) @@ -72,15 +97,23 @@ const ( ICMPv4InfoReply ICMPv4Type = 16 ) +// ICMP codes for ICMPv4 Time Exceeded messages as defined in RFC 792. +const ( + ICMPv4TTLExceeded ICMPv4Code = 0 +) + // ICMP codes for ICMPv4 Destination Unreachable messages as defined in RFC 792. 
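
The ICMPv4MinimumErrorPayloadSize comment above pins down how much of the offending datagram an ICMPv4 error must quote: the IP header plus at least the first 8 data octets. A standalone sketch of that sizing rule (the function name and sample values are assumptions):

package main

import "fmt"

// errorPayloadLen sketches the RFC 1122 rule quoted above: quote the
// invoking datagram's IP header plus at least its first 8 octets of
// data, capped by how much of the datagram is actually available.
func errorPayloadLen(ipHeaderLen, datagramLen int) int {
	const minErrorPayload = 8 // header.ICMPv4MinimumErrorPayloadSize
	quote := ipHeaderLen + minErrorPayload
	if datagramLen < quote {
		return datagramLen
	}
	return quote
}

func main() {
	fmt.Println(errorPayloadLen(20, 1500)) // 28: 20-byte header + first 8 octets
	fmt.Println(errorPayloadLen(20, 24))   // 24: short datagram, quote all of it
}
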
 const (
-	ICMPv4TTLExceeded ICMPv4Code = 0
+	ICMPv4NetUnreachable      ICMPv4Code = 0
 	ICMPv4HostUnreachable     ICMPv4Code = 1
 	ICMPv4ProtoUnreachable    ICMPv4Code = 2
 	ICMPv4PortUnreachable     ICMPv4Code = 3
 	ICMPv4FragmentationNeeded ICMPv4Code = 4
 )
 
+// ICMPv4UnusedCode is a code to use in ICMP messages where no code is needed.
+const ICMPv4UnusedCode ICMPv4Code = 0
+
 // Type is the ICMP type field.
 func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) }
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index 20b01d8f4..4303fc5d5 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -49,14 +49,17 @@ const (
 	// neighbor advertisement packet.
 	ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize
 
-	// ICMPv6NeighborAdvertSize is size of a neighbor advertisement
-	// including the NDP Target Link Layer option for an Ethernet
-	// address.
-	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + NDPLinkLayerAddressSize
-
-	// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
+	// ICMPv6EchoMinimumSize is the minimum size of a valid echo packet.
 	ICMPv6EchoMinimumSize = 8
 
+	// ICMPv6ErrorHeaderSize is the size of an ICMP error packet header,
+	// as per RFC 4443, Appendix A, item 4 and the errata.
+	//   ... all ICMP error messages shall have exactly
+	//   32 bits of type-specific data, so that receivers can reliably find
+	//   the embedded invoking packet even when they don't recognize the
+	//   ICMP message Type.
+	ICMPv6ErrorHeaderSize = 8
+
 	// ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP
 	// destination unreachable packet.
 	ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize
@@ -69,6 +72,10 @@ const (
 	// in an ICMPv6 message.
 	icmpv6ChecksumOffset = 2
 
+	// icmpv6PointerOffset is the offset of the pointer
+	// in an ICMPv6 Parameter Problem message.
+	icmpv6PointerOffset = 4
+
 	// icmpv6MTUOffset is the offset of the MTU field in an ICMPv6
 	// PacketTooBig message.
 	icmpv6MTUOffset = 4
@@ -89,9 +96,10 @@ const (
 	NDPHopLimit = 255
 )
 
-// ICMPv6Type is the ICMP type field described in RFC 4443 and friends.
+// ICMPv6Type is the ICMP type field described in RFC 4443.
 type ICMPv6Type byte
 
+// Values for use in the Type field of an ICMPv6 packet, from RFC 4443.
 const (
 	ICMPv6DstUnreachable ICMPv6Type = 1
 	ICMPv6PacketTooBig   ICMPv6Type = 2
@@ -109,7 +117,18 @@ const (
 	ICMPv6RedirectMsg ICMPv6Type = 137
 )
 
-// ICMPv6Code is the ICMP code field described in RFC 4443.
+// IsErrorType returns true if the receiver is an ICMP error type.
+func (typ ICMPv6Type) IsErrorType() bool {
+	// Per RFC 4443 section 2.1:
+	//   ICMPv6 messages are grouped into two classes: error messages and
+	//   informational messages. Error messages are identified as such by a
+	//   zero in the high-order bit of their message Type field values. Thus,
+	//   error messages have message types from 0 to 127; informational
+	//   messages have message types from 128 to 255.
+	return typ&0x80 == 0
+}
+
+// ICMPv6Code is the ICMP Code field described in RFC 4443.
 type ICMPv6Code byte
 
 // ICMP codes used with Destination Unreachable (Type 1). As per RFC 4443
@@ -132,9 +151,14 @@ const (
 
 // ICMP codes used with Parameter Problem (Type 4). As per RFC 4443 section 3.4.
 const (
+	// ICMPv6ErroneousHeader indicates an erroneous header field was encountered.
 	ICMPv6ErroneousHeader ICMPv6Code = 0
-	ICMPv6UnknownHeader   ICMPv6Code = 1
-	ICMPv6UnknownOption   ICMPv6Code = 2
+
+	// ICMPv6UnknownHeader indicates an unrecognized Next Header type was encountered.
+	ICMPv6UnknownHeader ICMPv6Code = 1
+
+	// ICMPv6UnknownOption indicates an unrecognized IPv6 option was encountered.
+	ICMPv6UnknownOption ICMPv6Code = 2
 )
 
 // ICMPv6UnusedCode is the code value used with ICMPv6 messages which don't use
@@ -153,6 +177,16 @@ func (b ICMPv6) Code() ICMPv6Code { return ICMPv6Code(b[1]) }
 
 // SetCode sets the ICMP code field.
 func (b ICMPv6) SetCode(c ICMPv6Code) { b[1] = byte(c) }
 
+// TypeSpecific returns the type specific data field.
+func (b ICMPv6) TypeSpecific() uint32 {
+	return binary.BigEndian.Uint32(b[icmpv6PointerOffset:])
+}
+
+// SetTypeSpecific sets the type specific data field.
+func (b ICMPv6) SetTypeSpecific(val uint32) {
+	binary.BigEndian.PutUint32(b[icmpv6PointerOffset:], val)
+}
+
 // Checksum is the ICMP checksum field.
 func (b ICMPv6) Checksum() uint16 {
 	return binary.BigEndian.Uint16(b[icmpv6ChecksumOffset:])
diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
index 680eafd16..4c6e4be64 100644
--- a/pkg/tcpip/header/ipv4.go
+++ b/pkg/tcpip/header/ipv4.go
@@ -16,10 +16,29 @@ package header
 
 import (
 	"encoding/binary"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// RFC 791 defines the fields of the IPv4 header on page 11 using the following
+// diagram: ("Figure 4")
+//     0                   1                   2                   3
+//     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |Version|  IHL  |Type of Service|          Total Length         |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |         Identification        |Flags|      Fragment Offset    |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |  Time to Live |    Protocol   |         Header Checksum       |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |                       Source Address                          |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |                    Destination Address                        |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |                    Options                    |    Padding    |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
 const (
 	versIHL = 0
 	tos     = 1
@@ -33,6 +52,7 @@
 	checksum = 10
 	srcAddr  = 12
 	dstAddr  = 16
+	options  = 20
 )
 
 // IPv4Fields contains the fields of an IPv4 packet. It is used to describe the
@@ -76,11 +96,13 @@ type IPv4Fields struct {
 
 // IPv4 represents an ipv4 header stored in a byte array.
 // Most of the methods of IPv4 access to the underlying slice without
 // checking the boundaries and could panic because of 'index out of range'.
-// Always call IsValid() to validate an instance of IPv4 before using other methods.
+// Always call IsValid() to validate an instance of IPv4 before using other
+// methods.
 type IPv4 []byte
 
 const (
-	// IPv4MinimumSize is the minimum size of a valid IPv4 packet.
+	// IPv4MinimumSize is the minimum size of a valid IPv4 packet;
+	// i.e. a packet header with no options.
 	IPv4MinimumSize = 20
 
 	// IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given
@@ -88,6 +110,16 @@ const (
 	// units, the header cannot exceed 15*4 = 60 bytes.
 	IPv4MaximumHeaderSize = 60
 
+	// IPv4MaximumPayloadSize is the maximum size of a valid IPv4 payload.
+	//
+	// Linux limits this to 65,515 octets (the max IP datagram size - the IPv4
+	// header size). But RFC 791 section 3.2 discusses the design of the IPv4
+	// fragment "allows 2**13 = 8192 fragments of 8 octets each for a total of
+	// 65,536 octets. Note that this is consistent with the datagram total
+	// length field (of course, the header is counted in the total length and not
+	// in the fragments)."
+ IPv4MaximumPayloadSize = 65536 + // MinIPFragmentPayloadSize is the minimum number of payload bytes that // the first fragment must carry when an IPv4 packet is fragmented. MinIPFragmentPayloadSize = 8 @@ -140,13 +172,44 @@ func IPVersion(b []byte) int { if len(b) < versIHL+1 { return -1 } - return int(b[versIHL] >> 4) + return int(b[versIHL] >> ipVersionShift) } +// RFC 791 page 11 shows the header length (IHL) is in the lower 4 bits +// of the first byte, and is counted in multiples of 4 bytes. +// +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |Version| IHL |Type of Service| Total Length | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// (...) +// Version: 4 bits +// The Version field indicates the format of the internet header. This +// document describes version 4. +// +// IHL: 4 bits +// Internet Header Length is the length of the internet header in 32 +// bit words, and thus points to the beginning of the data. Note that +// the minimum value for a correct header is 5. +// +const ( + ipVersionShift = 4 + ipIHLMask = 0x0f + IPv4IHLStride = 4 +) + // HeaderLength returns the value of the "header length" field of the ipv4 // header. The length returned is in bytes. func (b IPv4) HeaderLength() uint8 { - return (b[versIHL] & 0xf) * 4 + return (b[versIHL] & ipIHLMask) * IPv4IHLStride +} + +// SetHeaderLength sets the value of the "Internet Header Length" field. +func (b IPv4) SetHeaderLength(hdrLen uint8) { + if hdrLen > IPv4MaximumHeaderSize { + panic(fmt.Sprintf("got IPv4 Header size = %d, want <= %d", hdrLen, IPv4MaximumHeaderSize)) + } + b[versIHL] = (IPv4Version << ipVersionShift) | ((hdrLen / IPv4IHLStride) & ipIHLMask) } // ID returns the value of the identifier field of the ipv4 header. @@ -200,6 +263,12 @@ func (b IPv4) DestinationAddress() tcpip.Address { return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize]) } +// Options returns a a buffer holding the options. +func (b IPv4) Options() []byte { + hdrLen := b.HeaderLength() + return b[options:hdrLen:hdrLen] +} + // TransportProtocol implements Network.TransportProtocol. func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber { return tcpip.TransportProtocolNumber(b.Protocol()) @@ -225,6 +294,11 @@ func (b IPv4) SetTOS(v uint8, _ uint32) { b[tos] = v } +// SetTTL sets the "Time to Live" field of the IPv4 header. +func (b IPv4) SetTTL(v byte) { + b[ttl] = v +} + // SetTotalLength sets the "total length" field of the ipv4 header. func (b IPv4) SetTotalLength(totalLength uint16) { binary.BigEndian.PutUint16(b[IPv4TotalLenOffset:], totalLength) @@ -265,7 +339,7 @@ func (b IPv4) CalculateChecksum() uint16 { // Encode encodes all the fields of the ipv4 header. func (b IPv4) Encode(i *IPv4Fields) { - b[versIHL] = (4 << 4) | ((i.IHL / 4) & 0xf) + b.SetHeaderLength(i.IHL) b[tos] = i.TOS b.SetTotalLength(i.TotalLength) binary.BigEndian.PutUint16(b[id:], i.ID) @@ -317,7 +391,7 @@ func IsV4MulticastAddress(addr tcpip.Address) bool { } // IsV4LoopbackAddress determines if the provided address is an IPv4 loopback -// address (belongs to 127.0.0.1/8 subnet). +// address (belongs to 127.0.0.0/8 subnet). See RFC 1122 section 3.2.1.3. 
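
The version and IHL packing described above is compact enough to exercise end to end. A standalone sketch using the same shift, mask, and stride values:

package main

import "fmt"

func main() {
	const (
		ipVersionShift = 4    // version lives in the high nibble
		ipIHLMask      = 0x0f // header length lives in the low nibble
		ihlStride      = 4    // IHL counts 32-bit words
	)

	// Pack version 4 with a 20-byte header, as SetHeaderLength does.
	versIHL := byte(4<<ipVersionShift | (20/ihlStride)&ipIHLMask)
	fmt.Printf("%#02x\n", versIHL) // 0x45: the classic first byte of an IPv4 header

	// Unpack, as IPVersion and HeaderLength do.
	fmt.Println(versIHL>>ipVersionShift, (versIHL&ipIHLMask)*ihlStride) // 4 20
}
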
func IsV4LoopbackAddress(addr tcpip.Address) bool { if len(addr) != IPv4AddressSize { return false diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index ea3823898..c5d8a3456 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -34,6 +34,9 @@ const ( hopLimit = 7 v6SrcAddr = 8 v6DstAddr = v6SrcAddr + IPv6AddressSize + + // IPv6FixedHeaderSize is the size of the fixed header. + IPv6FixedHeaderSize = v6DstAddr + IPv6AddressSize ) // IPv6Fields contains the fields of an IPv6 packet. It is used to describe the @@ -69,11 +72,15 @@ type IPv6 []byte const ( // IPv6MinimumSize is the minimum size of a valid IPv6 packet. - IPv6MinimumSize = 40 + IPv6MinimumSize = IPv6FixedHeaderSize // IPv6AddressSize is the size, in bytes, of an IPv6 address. IPv6AddressSize = 16 + // IPv6MaximumPayloadSize is the maximum size of a valid IPv6 payload per + // RFC 8200 Section 4.5. + IPv6MaximumPayloadSize = 65535 + // IPv6ProtocolNumber is IPv6's network protocol number. IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd @@ -302,14 +309,21 @@ func IsV6UnicastAddress(addr tcpip.Address) bool { return addr[0] != 0xff } +const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff" + // SolicitedNodeAddr computes the solicited-node multicast address. This is // used for NDP. Described in RFC 4291. The argument must be a full-length IPv6 // address. func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address { - const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff" return solicitedNodeMulticastPrefix + addr[len(addr)-3:] } +// IsSolicitedNodeAddr determines whether the address is a solicited-node +// multicast address. +func IsSolicitedNodeAddr(addr tcpip.Address) bool { + return solicitedNodeMulticastPrefix == addr[:len(addr)-3] +} + // EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64 // from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1. // diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go index 3499d8399..583c2c5d3 100644 --- a/pkg/tcpip/header/ipv6_extension_headers.go +++ b/pkg/tcpip/header/ipv6_extension_headers.go @@ -149,6 +149,19 @@ func (b ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator { // obtained before modification is no longer used. type IPv6OptionsExtHdrOptionsIterator struct { reader bytes.Reader + + // optionOffset is the number of bytes from the first byte of the + // options field to the beginning of the current option. + optionOffset uint32 + + // nextOptionOffset is the offset of the next option. + nextOptionOffset uint32 +} + +// OptionOffset returns the number of bytes parsed while processing the +// option field of the current Extension Header. +func (i *IPv6OptionsExtHdrOptionsIterator) OptionOffset() uint32 { + return i.optionOffset } // IPv6OptionUnknownAction is the action that must be taken if the processing @@ -226,6 +239,7 @@ func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {} // the options data, or an error occured. func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) { for { + i.optionOffset = i.nextOptionOffset temp, err := i.reader.ReadByte() if err != nil { // If we can't read the first byte of a new option, then we know the @@ -238,6 +252,7 @@ func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error // know the option does not have Length and Data fields. 
End processing of // the Pad1 option and continue processing the buffer as a new option. if id == ipv6Pad1ExtHdrOptionIdentifier { + i.nextOptionOffset = i.optionOffset + 1 continue } @@ -254,41 +269,40 @@ func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF) } - // Special-case the variable length padding option to avoid a copy. - if id == ipv6PadNExtHdrOptionIdentifier { - // Do we have enough bytes in the reader for the PadN option? - if n := i.reader.Len(); n < int(length) { - // Reset the reader to effectively consume the remaining buffer. - i.reader.Reset(nil) - - // We return the same error as if we failed to read a non-padding option - // so consumers of this iterator don't need to differentiate between - // padding and non-padding options. - return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF) - } + // Do we have enough bytes in the reader for the next option? + if n := i.reader.Len(); n < int(length) { + // Reset the reader to effectively consume the remaining buffer. + i.reader.Reset(nil) + + // We return the same error as if we failed to read a non-padding option + // so consumers of this iterator don't need to differentiate between + // padding and non-padding options. + return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF) + } + + i.nextOptionOffset = i.optionOffset + uint32(length) + 1 /* option ID */ + 1 /* length byte */ + switch id { + case ipv6PadNExtHdrOptionIdentifier: + // Special-case the variable length padding option to avoid a copy. if _, err := i.reader.Seek(int64(length), io.SeekCurrent); err != nil { panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err)) } - - // End processing of the PadN option and continue processing the buffer as - // a new option. continue - } - - bytes := make([]byte, length) - if n, err := io.ReadFull(&i.reader, bytes); err != nil { - // io.ReadFull may return io.EOF if i.reader has been exhausted. We use - // io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the - // Length field found in the option. - if err == io.EOF { - err = io.ErrUnexpectedEOF + default: + bytes := make([]byte, length) + if n, err := io.ReadFull(&i.reader, bytes); err != nil { + // io.ReadFull may return io.EOF if i.reader has been exhausted. We use + // io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the + // Length field found in the option. + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + + return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err) } - - return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err) + return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil } - - return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil } } @@ -382,6 +396,29 @@ type IPv6PayloadIterator struct { // Indicates to the iterator that it should return the remaining payload as a // raw payload on the next call to Next. forceRaw bool + + // headerOffset is the offset of the beginning of the current extension + // header starting from the beginning of the fixed header. 
+ headerOffset uint32 + + // parseOffset is the byte offset into the current extension header of the + // field we are currently examining. It can be added to the header offset + // if the absolute offset within the packet is required. + parseOffset uint32 + + // nextOffset is the offset of the next header. + nextOffset uint32 +} + +// HeaderOffset returns the offset to the start of the extension +// header most recently processed. +func (i IPv6PayloadIterator) HeaderOffset() uint32 { + return i.headerOffset +} + +// ParseOffset returns the number of bytes successfully parsed. +func (i IPv6PayloadIterator) ParseOffset() uint32 { + return i.headerOffset + i.parseOffset } // MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing @@ -397,7 +434,8 @@ func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, pa nextHdrIdentifier: nextHdrIdentifier, payload: payload.Clone(nil), // We need a buffer of size 1 for calls to bufio.Reader.ReadByte. - reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1), + reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1), + nextOffset: IPv6FixedHeaderSize, } } @@ -434,6 +472,8 @@ func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader { // Next is unable to return anything because the iterator has reached the end of // the payload, or an error occured. func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) { + i.headerOffset = i.nextOffset + i.parseOffset = 0 // We could be forced to return i as a raw header when the previous header was // a fragment extension header as the data following the fragment extension // header may not be complete. @@ -461,7 +501,7 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) { return IPv6RoutingExtHdr(bytes), false, nil case IPv6FragmentExtHdrIdentifier: var data [6]byte - // We ignore the returned bytes becauase we know the fragment extension + // We ignore the returned bytes because we know the fragment extension // header specific data will fit in data. nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:]) if err != nil { @@ -519,10 +559,12 @@ func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IP if err != nil { return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } + i.parseOffset++ var length uint8 length, err = i.reader.ReadByte() i.payload.TrimFront(1) + if err != nil { if fragmentHdr { return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err) @@ -534,6 +576,17 @@ func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IP length = 0 } + // Make parseOffset point to the first byte of the Extension Header + // specific data. + i.parseOffset++ + + // length is in 8 byte chunks but doesn't include the first one. + // See RFC 8200 for each header type, sections 4.3-4.6 and the requirement + // in section 4.8 for new extension headers at the top of page 24. + // [ Hdr Ext Len ] ... Length of the Destination Options header in 8-octet + // units, not including the first 8 octets. 
+ i.nextOffset += uint32((length + 1) * ipv6ExtHdrLenBytesPerUnit) + bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded if bytes == nil { bytes = make([]byte, bytesLen) diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go index b5540bf66..17a49d4fa 100644 --- a/pkg/tcpip/header/ipversion_test.go +++ b/pkg/tcpip/header/ipversion_test.go @@ -22,7 +22,7 @@ import ( func TestIPv4(t *testing.T) { b := header.IPv4(make([]byte, header.IPv4MinimumSize)) - b.Encode(&header.IPv4Fields{}) + b.Encode(&header.IPv4Fields{IHL: header.IPv4MinimumSize}) const want = header.IPv4Version if v := header.IPVersion(b); v != want { diff --git a/pkg/tcpip/header/parse/BUILD b/pkg/tcpip/header/parse/BUILD new file mode 100644 index 000000000..2adee9288 --- /dev/null +++ b/pkg/tcpip/header/parse/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "parse", + srcs = ["parse.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go new file mode 100644 index 000000000..5ca75c834 --- /dev/null +++ b/pkg/tcpip/header/parse/parse.go @@ -0,0 +1,168 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package parse provides utilities to parse packets. +package parse + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// ARP populates pkt's network header with an ARP header found in +// pkt.Data. +// +// Returns true if the header was successfully parsed. +func ARP(pkt *stack.PacketBuffer) bool { + _, ok := pkt.NetworkHeader().Consume(header.ARPSize) + if ok { + pkt.NetworkProtocolNumber = header.ARPProtocolNumber + } + return ok +} + +// IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network +// header with the IPv4 header. +// +// Returns true if the header was successfully parsed. +func IPv4(pkt *stack.PacketBuffer) bool { + hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) + if !ok { + return false + } + ipHdr := header.IPv4(hdr) + + // Header may have options, determine the true header length. + headerLen := int(ipHdr.HeaderLength()) + if headerLen < header.IPv4MinimumSize { + // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in + // order for the packet to be valid. Figure out if we want to reject this + // case. + headerLen = header.IPv4MinimumSize + } + hdr, ok = pkt.NetworkHeader().Consume(headerLen) + if !ok { + return false + } + ipHdr = header.IPv4(hdr) + + pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) + return true +} + +// IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network +// header with the IPv6 header. 
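
The Hdr Ext Len arithmetic above, where the field counts 8-octet units and excludes the first unit, is a classic off-by-one trap. A standalone sketch (bytesPerUnit mirrors ipv6ExtHdrLenBytesPerUnit from this file):

package main

import "fmt"

// extHdrBytes mirrors the arithmetic above: the Hdr Ext Len field counts
// 8-octet units and does not count the first unit.
func extHdrBytes(hdrExtLen uint8) int {
	const bytesPerUnit = 8 // ipv6ExtHdrLenBytesPerUnit in this file
	return (int(hdrExtLen) + 1) * bytesPerUnit
}

func main() {
	fmt.Println(extHdrBytes(0)) // 8: a bare, minimum-size extension header
	fmt.Println(extHdrBytes(2)) // 24: the first 8 octets plus two more units
}
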
+func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) { + hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize) + if !ok { + return 0, 0, 0, false, false + } + ipHdr := header.IPv6(hdr) + + // dataClone consists of: + // - Any IPv6 header bytes after the first 40 (i.e. extensions). + // - The transport header, if present. + // - Any other payload data. + views := [8]buffer.View{} + dataClone := pkt.Data.Clone(views[:]) + dataClone.TrimFront(header.IPv6MinimumSize) + it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone) + + // Iterate over the IPv6 extensions to find their length. + var nextHdr tcpip.TransportProtocolNumber + var extensionsSize int + +traverseExtensions: + for { + extHdr, done, err := it.Next() + if err != nil { + break + } + + // If we exhaust the extension list, the entire packet is the IPv6 header + // and (possibly) extensions. + if done { + extensionsSize = dataClone.Size() + break + } + + switch extHdr := extHdr.(type) { + case header.IPv6FragmentExtHdr: + if fragID == 0 && fragOffset == 0 && !fragMore { + fragID = extHdr.ID() + fragOffset = extHdr.FragmentOffset() + fragMore = extHdr.More() + } + + case header.IPv6RawPayloadHeader: + // We've found the payload after any extensions. + extensionsSize = dataClone.Size() - extHdr.Buf.Size() + nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) + break traverseExtensions + + default: + // Any other extension is a no-op, keep looping until we find the payload. + } + } + + // Put the IPv6 header with extensions in pkt.NetworkHeader(). + hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize) + if !ok { + panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size())) + } + ipHdr = header.IPv6(hdr) + pkt.Data.CapLength(int(ipHdr.PayloadLength())) + pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber + + return nextHdr, fragID, fragOffset, fragMore, true +} + +// UDP parses a UDP packet found in pkt.Data and populates pkt's transport +// header with the UDP header. +// +// Returns true if the header was successfully parsed. +func UDP(pkt *stack.PacketBuffer) bool { + _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) + pkt.TransportProtocolNumber = header.UDPProtocolNumber + return ok +} + +// TCP parses a TCP packet found in pkt.Data and populates pkt's transport +// header with the TCP header. +// +// Returns true if the header was successfully parsed. +func TCP(pkt *stack.PacketBuffer) bool { + // TCP header is variable length, peek at it first. + hdrLen := header.TCPMinimumSize + hdr, ok := pkt.Data.PullUp(hdrLen) + if !ok { + return false + } + + // If the header has options, pull those up as well. + if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { + // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of + // packets. + hdrLen = offset + } + + _, ok = pkt.TransportHeader().Consume(hdrLen) + pkt.TransportProtocolNumber = header.TCPProtocolNumber + return ok +} diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index 9339d637f..98bdd29db 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -16,6 +16,7 @@ package header import ( "encoding/binary" + "math" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -55,6 +56,10 @@ const ( // UDPMinimumSize is the minimum size of a valid UDP packet. 
UDPMinimumSize = 8 + // UDPMaximumSize is the maximum size of a valid UDP packet. The length field + // in the UDP header is 16 bits as per RFC 768. + UDPMaximumSize = math.MaxUint16 + // UDPProtocolNumber is UDP's transport protocol number. UDPProtocolNumber tcpip.TransportProtocolNumber = 17 ) diff --git a/pkg/tcpip/link/ethernet/BUILD b/pkg/tcpip/link/ethernet/BUILD new file mode 100644 index 000000000..ec92ed623 --- /dev/null +++ b/pkg/tcpip/link/ethernet/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "ethernet", + srcs = ["ethernet.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/header", + "//pkg/tcpip/link/nested", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/link/ethernet/ethernet.go b/pkg/tcpip/link/ethernet/ethernet.go new file mode 100644 index 000000000..3eef7cd56 --- /dev/null +++ b/pkg/tcpip/link/ethernet/ethernet.go @@ -0,0 +1,99 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ethernet provides an implementation of an ethernet link endpoint that +// wraps an inner link endpoint. +package ethernet + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/nested" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +var _ stack.NetworkDispatcher = (*Endpoint)(nil) +var _ stack.LinkEndpoint = (*Endpoint)(nil) + +// New returns an ethernet link endpoint that wraps an inner link endpoint. +func New(ep stack.LinkEndpoint) *Endpoint { + var e Endpoint + e.Endpoint.Init(ep, &e) + return &e +} + +// Endpoint is an ethernet endpoint. +// +// It adds an ethernet header to packets before sending them out through its +// inner link endpoint and consumes an ethernet header before sending the +// packet to the stack. +type Endpoint struct { + nested.Endpoint +} + +// DeliverNetworkPacket implements stack.NetworkDispatcher. +func (e *Endpoint) DeliverNetworkPacket(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) + if !ok { + return + } + + eth := header.Ethernet(hdr) + if dst := eth.DestinationAddress(); dst == e.Endpoint.LinkAddress() || dst == header.EthernetBroadcastAddress || header.IsMulticastEthernetAddress(dst) { + e.Endpoint.DeliverNetworkPacket(eth.SourceAddress() /* remote */, dst /* local */, eth.Type() /* protocol */, pkt) + } +} + +// Capabilities implements stack.LinkEndpoint. +func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { + return stack.CapabilityResolutionRequired | e.Endpoint.Capabilities() +} + +// WritePacket implements stack.LinkEndpoint. 
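
The destination filter in DeliverNetworkPacket above reduces to a three-way test: our own address, the broadcast address, or a multicast address (the I/G bit in the first octet). The predicate in isolation, with hardcoded sample addresses:

package main

import "fmt"

// accepts mirrors the filter in DeliverNetworkPacket above: deliver a
// frame if it is addressed to us, to broadcast, or to multicast.
func accepts(dst, local [6]byte) bool {
	broadcast := [6]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
	return dst == local || dst == broadcast || dst[0]&0x01 != 0
}

func main() {
	local := [6]byte{0x02, 0x00, 0x00, 0x00, 0x00, 0x01}
	fmt.Println(accepts(local, local))                                       // true: our address
	fmt.Println(accepts([6]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, local)) // true: broadcast
	fmt.Println(accepts([6]byte{0x01, 0x00, 0x5e, 0x00, 0x00, 0x01}, local)) // true: multicast bit set
	fmt.Println(accepts([6]byte{0x02, 0x00, 0x00, 0x00, 0x00, 0x02}, local)) // false: another host
}
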
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress, proto, pkt) + return e.Endpoint.WritePacket(r, gso, proto, pkt) +} + +// WritePackets implements stack.LinkEndpoint. +func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + linkAddr := e.Endpoint.LinkAddress() + + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.AddHeader(linkAddr, r.RemoteLinkAddress, proto, pkt) + } + + return e.Endpoint.WritePackets(r, gso, pkts, proto) +} + +// MaxHeaderLength implements stack.LinkEndpoint. +func (e *Endpoint) MaxHeaderLength() uint16 { + return header.EthernetMinimumSize + e.Endpoint.MaxHeaderLength() +} + +// ARPHardwareType implements stack.LinkEndpoint. +func (*Endpoint) ARPHardwareType() header.ARPHardwareType { + return header.ARPHardwareEther +} + +// AddHeader implements stack.LinkEndpoint. +func (*Endpoint) AddHeader(local, remote tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) + fields := header.EthernetFields{ + SrcAddr: local, + DstAddr: remote, + Type: proto, + } + eth.Encode(&fields) +} diff --git a/pkg/tcpip/link/pipe/BUILD b/pkg/tcpip/link/pipe/BUILD new file mode 100644 index 000000000..9f31c1ffc --- /dev/null +++ b/pkg/tcpip/link/pipe/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pipe", + srcs = ["pipe.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go new file mode 100644 index 000000000..523b0d24b --- /dev/null +++ b/pkg/tcpip/link/pipe/pipe.go @@ -0,0 +1,115 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides the implementation of pipe-like data-link layer +// endpoints. Such endpoints allow packets to be sent between two interfaces. +package pipe + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +var _ stack.LinkEndpoint = (*Endpoint)(nil) + +// New returns both ends of a new pipe. +func New(linkAddr1, linkAddr2 tcpip.LinkAddress) (*Endpoint, *Endpoint) { + ep1 := &Endpoint{ + linkAddr: linkAddr1, + } + ep2 := &Endpoint{ + linkAddr: linkAddr2, + } + ep1.linked = ep2 + ep2.linked = ep1 + return ep1, ep2 +} + +// Endpoint is one end of a pipe. +type Endpoint struct { + dispatcher stack.NetworkDispatcher + linked *Endpoint + linkAddr tcpip.LinkAddress +} + +// WritePacket implements stack.LinkEndpoint. 
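
The pipe and the ethernet wrapper compose naturally: wrapping each end of a pipe in an ethernet endpoint yields a two-node link with ethernet framing and destination filtering. A hypothetical wiring sketch (the link addresses and the CreateNIC calls mentioned in comments are assumptions):

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
	"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
)

func main() {
	// Two locally administered unicast addresses, one per pipe end.
	ep1, ep2 := pipe.New("\x02\x00\x00\x00\x00\x01", "\x02\x00\x00\x00\x00\x02")

	// Each stack would receive its end wrapped in ethernet framing,
	// e.g. stack1.CreateNIC(1, ethernet.New(ep1)) in a real test.
	link1 := ethernet.New(ep1)
	link2 := ethernet.New(ep2)
	_, _ = link1, link2
}
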
+func (e *Endpoint) WritePacket(r *stack.Route, _ *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + if !e.linked.IsAttached() { + return nil + } + + // Note that the local address from the perspective of this endpoint is the + // remote address from the perspective of the other end of the pipe + // (e.linked). Similarly, the remote address from the perspective of this + // endpoint is the local address on the other end. + e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), + })) + + return nil +} + +// WritePackets implements stack.LinkEndpoint. +func (*Endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + panic("not implemented") +} + +// WriteRawPacket implements stack.LinkEndpoint. +func (*Endpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { + panic("not implemented") +} + +// Attach implements stack.LinkEndpoint. +func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.dispatcher = dispatcher +} + +// IsAttached implements stack.LinkEndpoint. +func (e *Endpoint) IsAttached() bool { + return e.dispatcher != nil +} + +// Wait implements stack.LinkEndpoint. +func (*Endpoint) Wait() {} + +// MTU implements stack.LinkEndpoint. +func (*Endpoint) MTU() uint32 { + return header.IPv6MinimumMTU +} + +// Capabilities implements stack.LinkEndpoint. +func (*Endpoint) Capabilities() stack.LinkEndpointCapabilities { + return 0 +} + +// MaxHeaderLength implements stack.LinkEndpoint. +func (*Endpoint) MaxHeaderLength() uint16 { + return 0 +} + +// LinkAddress implements stack.LinkEndpoint. +func (e *Endpoint) LinkAddress() tcpip.LinkAddress { + return e.linkAddr +} + +// ARPHardwareType implements stack.LinkEndpoint. +func (*Endpoint) ARPHardwareType() header.ARPHardwareType { + return header.ARPHardwareNone +} + +// AddHeader implements stack.LinkEndpoint. +func (*Endpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { +} diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 14b527bc2..6c410c5a6 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -18,3 +18,14 @@ go_library( "@org_golang_x_sys//unix:go_default_library", ], ) + +go_test( + name = "rawfile_test", + srcs = [ + "errors_test.go", + ], + library = "rawfile", + deps = [ + "//pkg/tcpip", + ], +) diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go index 99313ee25..5db4bf12b 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go @@ -14,7 +14,7 @@ // +build linux,amd64 linux,arm64 // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index a0a873c84..604868fd8 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -31,10 +31,12 @@ var translations [maxErrno]*tcpip.Error // *tcpip.Error. // // Valid, but unrecognized errnos will be translated to -// tcpip.ErrInvalidEndpointState (EINVAL). 
Panics on invalid errnos. +// tcpip.ErrInvalidEndpointState (EINVAL). func TranslateErrno(e syscall.Errno) *tcpip.Error { - if err := translations[e]; err != nil { - return err + if e > 0 && e < syscall.Errno(len(translations)) { + if err := translations[e]; err != nil { + return err + } } return tcpip.ErrInvalidEndpointState } diff --git a/pkg/tcpip/link/rawfile/errors_test.go b/pkg/tcpip/link/rawfile/errors_test.go new file mode 100644 index 000000000..e4cdc66bd --- /dev/null +++ b/pkg/tcpip/link/rawfile/errors_test.go @@ -0,0 +1,53 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package rawfile + +import ( + "syscall" + "testing" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +func TestTranslateErrno(t *testing.T) { + for _, test := range []struct { + errno syscall.Errno + translated *tcpip.Error + }{ + { + errno: syscall.Errno(0), + translated: tcpip.ErrInvalidEndpointState, + }, + { + errno: syscall.Errno(maxErrno), + translated: tcpip.ErrInvalidEndpointState, + }, + { + errno: syscall.Errno(514), + translated: tcpip.ErrInvalidEndpointState, + }, + { + errno: syscall.EEXIST, + translated: tcpip.ErrDuplicateAddress, + }, + } { + got := TranslateErrno(test.errno) + if got != test.translated { + t.Errorf("TranslateErrno(%q) = %q, want %q", test.errno, got, test.translated) + } + } +} diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index dc239a0d0..2777f1411 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -470,6 +470,7 @@ func TestConcurrentReaderWriter(t *testing.T) { const count = 1000000 var wg sync.WaitGroup + defer wg.Wait() wg.Add(1) go func() { defer wg.Done() @@ -489,30 +490,23 @@ func TestConcurrentReaderWriter(t *testing.T) { } }() - wg.Add(1) - go func() { - defer wg.Done() - runtime.Gosched() - for i := 0; i < count; i++ { - n := 1 + rr.Intn(80) - rb := rx.Pull() - for rb == nil { - rb = rx.Pull() - } + for i := 0; i < count; i++ { + n := 1 + rr.Intn(80) + rb := rx.Pull() + for rb == nil { + rb = rx.Pull() + } - if n != len(rb) { - t.Fatalf("Bad %v-th buffer length: got %v, want %v", i, len(rb), n) - } + if n != len(rb) { + t.Fatalf("Bad %v-th buffer length: got %v, want %v", i, len(rb), n) + } - for j := range rb { - if v := byte(rr.Intn(256)); v != rb[j] { - t.Fatalf("Bad %v-th read buffer at index %v: got %v, want %v", i, j, rb[j], v) - } + for j := range rb { + if v := byte(rr.Intn(256)); v != rb[j] { + t.Fatalf("Bad %v-th read buffer at index %v: got %v, want %v", i, j, rb[j], v) } - - rx.Flush() } - }() - wg.Wait() + rx.Flush() + } } diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 7cbc305e7..4aac12a8c 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -14,6 +14,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/link/nested", "//pkg/tcpip/stack", ], diff --git 
a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 4fb127978..560477926 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -195,49 +196,52 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P var transProto uint8 src := tcpip.Address("unknown") dst := tcpip.Address("unknown") - id := 0 - size := uint16(0) + var size uint16 + var id uint32 var fragmentOffset uint16 var moreFragments bool - // Examine the packet using a new VV. Backing storage must not be written. - vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) - + // Clone the packet buffer to not modify the original. + // + // We don't clone the original packet buffer so that the new packet buffer + // does not have any of its headers set. + pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views())}) switch protocol { case header.IPv4ProtocolNumber: - hdr, ok := vv.PullUp(header.IPv4MinimumSize) - if !ok { + if ok := parse.IPv4(pkt); !ok { return } - ipv4 := header.IPv4(hdr) + + ipv4 := header.IPv4(pkt.NetworkHeader().View()) fragmentOffset = ipv4.FragmentOffset() moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments src = ipv4.SourceAddress() dst = ipv4.DestinationAddress() transProto = ipv4.Protocol() size = ipv4.TotalLength() - uint16(ipv4.HeaderLength()) - vv.TrimFront(int(ipv4.HeaderLength())) - id = int(ipv4.ID()) + id = uint32(ipv4.ID()) case header.IPv6ProtocolNumber: - hdr, ok := vv.PullUp(header.IPv6MinimumSize) + proto, fragID, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return } - ipv6 := header.IPv6(hdr) + + ipv6 := header.IPv6(pkt.NetworkHeader().View()) src = ipv6.SourceAddress() dst = ipv6.DestinationAddress() - transProto = ipv6.NextHeader() + transProto = uint8(proto) size = ipv6.PayloadLength() - vv.TrimFront(header.IPv6MinimumSize) + id = fragID + moreFragments = fragMore + fragmentOffset = fragOffset case header.ARPProtocolNumber: - hdr, ok := vv.PullUp(header.ARPSize) - if !ok { + if !parse.ARP(pkt) { return } - vv.TrimFront(header.ARPSize) - arp := header.ARP(hdr) + + arp := header.ARP(pkt.NetworkHeader().View()) log.Infof( "%s arp %s (%s) -> %s (%s) valid:%t", prefix, @@ -259,7 +263,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P switch tcpip.TransportProtocolNumber(transProto) { case header.ICMPv4ProtocolNumber: transName = "icmp" - hdr, ok := vv.PullUp(header.ICMPv4MinimumSize) + hdr, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize) if !ok { break } @@ -296,7 +300,7 @@ case header.ICMPv6ProtocolNumber: transName = "icmp" - hdr, ok := vv.PullUp(header.ICMPv6MinimumSize) + hdr, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize) if !ok { break } @@ -331,11 +335,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.UDPProtocolNumber: transName = "udp" - hdr, ok := vv.PullUp(header.UDPMinimumSize) - if !ok { + if ok := parse.UDP(pkt); !ok { break } - udp := header.UDP(hdr) + + udp := header.UDP(pkt.TransportHeader().View()) if fragmentOffset == 0 { srcPort = udp.SourcePort() dstPort = udp.DestinationPort() @@
-345,19 +349,19 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.TCPProtocolNumber: transName = "tcp" - hdr, ok := vv.PullUp(header.TCPMinimumSize) - if !ok { + if ok := parse.TCP(pkt); !ok { break } - tcp := header.TCP(hdr) + + tcp := header.TCP(pkt.TransportHeader().View()) if fragmentOffset == 0 { offset := int(tcp.DataOffset()) if offset < header.TCPMinimumSize { details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset) break } - if offset > vv.Size() && !moreFragments { - details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size()) + if size := pkt.Data.Size() + len(tcp); offset > size && !moreFragments { + details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size) break } diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 6c137f693..0243424f6 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,18 +1,32 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "tun_endpoint_refs", + out = "tun_endpoint_refs.go", + package = "tun", + prefix = "tunEndpoint", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "tunEndpoint", + }, +) + go_library( name = "tun", srcs = [ "device.go", "protocol.go", + "tun_endpoint_refs.go", "tun_unsafe.go", ], visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", "//pkg/refs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 3b1510a33..f94491026 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -77,13 +76,29 @@ func (d *Device) Release(ctx context.Context) { } } +// NICID returns the NIC ID of the device. +// +// Must only be called after the device has been attached to an endpoint. +func (d *Device) NICID() tcpip.NICID { + d.mu.RLock() + defer d.mu.RUnlock() + + if d.endpoint == nil { + panic("called NICID on a device that has not been attached") + } + + return d.endpoint.nicID +} + // SetIff services TUNSETIFF ioctl(2) request. -func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error { +// +// Returns true if a new NIC was created; false if an existing one was attached. +func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) (bool, error) { d.mu.Lock() defer d.mu.Unlock() if d.endpoint != nil { - return syserror.EINVAL + return false, syserror.EINVAL } // Input validations. 
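A sketch of how a caller might consume the new (created, error) return from SetIff; the package name and helper below are illustrative, not part of this change:

package tunexample

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// ConfigureTap attaches dev to s as a TAP interface named name and reports
// whether SetIff created a brand-new NIC or attached to an existing one.
func ConfigureTap(s *stack.Stack, dev *tun.Device, name string) error {
	created, err := dev.SetIff(s, name, linux.IFF_TAP|linux.IFF_NO_PI)
	if err != nil {
		return fmt.Errorf("SetIff(%q): %v", name, err)
	}
	// NICID may only be called once the device has been attached.
	if created {
		fmt.Printf("created NIC %d for %q\n", dev.NICID(), name)
	} else {
		fmt.Printf("attached to existing NIC %d for %q\n", dev.NICID(), name)
	}
	return nil
}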
@@ -91,7 +106,7 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error { isTap := flags&linux.IFF_TAP != 0 supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI) if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 { - return syserror.EINVAL + return false, syserror.EINVAL } prefix := "tun" @@ -104,37 +119,38 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error { linkCaps |= stack.CapabilityResolutionRequired } - endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps) + endpoint, created, err := attachOrCreateNIC(s, name, prefix, linkCaps) if err != nil { - return syserror.EINVAL + return false, syserror.EINVAL } d.endpoint = endpoint d.notifyHandle = d.endpoint.AddNotify(d) d.flags = flags - return nil + return created, nil } -func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) { +func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, bool, error) { for { // 1. Try to attach to an existing NIC. if name != "" { - if nic, found := s.GetNICByName(name); found { - endpoint, ok := nic.LinkEndpoint().(*tunEndpoint) + if linkEP := s.GetLinkEndpointByName(name); linkEP != nil { + endpoint, ok := linkEP.(*tunEndpoint) if !ok { // Not a NIC created by tun device. - return nil, syserror.EOPNOTSUPP + return nil, false, syserror.EOPNOTSUPP } if !endpoint.TryIncRef() { // Race detected: NIC got deleted in between. continue } - return endpoint, nil + return endpoint, false, nil } } // 2. Creating a new NIC. id := tcpip.NICID(s.UniqueID()) + // TODO(gvisor.dev/1486): enable leak check for tunEndpoint. endpoint := &tunEndpoint{ Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""), stack: s, @@ -151,12 +167,12 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE }) switch err { case nil: - return endpoint, nil + return endpoint, true, nil case tcpip.ErrDuplicateNICID: // Race detected: A NIC has been created in between. continue default: - return nil, syserror.EINVAL + return nil, false, syserror.EINVAL } } } @@ -331,19 +347,18 @@ func (d *Device) WriteNotify() { // It is ref-counted as multiple opening files can attach to the same NIC. // The last owner is responsible for deleting the NIC. type tunEndpoint struct { + tunEndpointRefs *channel.Endpoint - refs.AtomicRefCount - stack *stack.Stack nicID tcpip.NICID name string isTap bool } -// DecRef decrements refcount of e, removes NIC if refcount goes to 0. +// DecRef decrements refcount of e, removing NIC if it reaches 0. 
func (e *tunEndpoint) DecRef(ctx context.Context) { - e.DecRefWithDestructor(ctx, func(context.Context) { + e.tunEndpointRefs.DecRef(func() { e.stack.RemoveNIC(e.nicID) }) } diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 46083925c..c118a2929 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -9,14 +9,17 @@ go_test( "ip_test.go", ], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/checker", "//pkg/tcpip/header", "//pkg/tcpip/link/channel", "//pkg/tcpip/link/loopback", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", ], diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index eddf7b725..b40dde96b 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/stack", ], ) @@ -28,5 +29,6 @@ go_test( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/stack", "//pkg/tcpip/transport/icmp", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 920872c3f..7df77c66e 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -15,20 +15,15 @@ // Package arp implements the ARP network protocol. It is used to resolve // IPv4 addresses into link-local MAC addresses, and advertises IPv4 // addresses of its stack with the local network. -// -// To use it in the networking stack, pass arp.NewProtocol() as one of the -// network protocols when calling stack.New. Then add an "arp" address to every -// NIC on the stack that should respond to ARP requests. That is: -// -// if err := s.AddAddress(1, arp.ProtocolNumber, "arp"); err != nil { -// // handle err -// } package arp import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -40,12 +35,54 @@ const ( ProtocolAddress = tcpip.Address("arp") ) -// endpoint implements stack.NetworkEndpoint. +var _ stack.AddressableEndpoint = (*endpoint)(nil) +var _ stack.NetworkEndpoint = (*endpoint)(nil) + type endpoint struct { - protocol *protocol - nicID tcpip.NICID - linkEP stack.LinkEndpoint + stack.AddressableEndpointState + + protocol *protocol + + // enabled is set to 1 when the NIC is enabled and 0 when it is disabled. + // + // Must be accessed using atomic operations. + enabled uint32 + + nic stack.NetworkInterface linkAddrCache stack.LinkAddressCache + nud stack.NUDHandler +} + +func (e *endpoint) Enable() *tcpip.Error { + if !e.nic.Enabled() { + return tcpip.ErrNotPermitted + } + + e.setEnabled(true) + return nil +} + +func (e *endpoint) Enabled() bool { + return e.nic.Enabled() && e.isEnabled() +} + +// isEnabled returns true if the endpoint is enabled, regardless of the +// enabled status of the NIC. +func (e *endpoint) isEnabled() bool { + return atomic.LoadUint32(&e.enabled) == 1 +} + +// setEnabled sets the enabled status for the endpoint. +func (e *endpoint) setEnabled(v bool) { + if v { + atomic.StoreUint32(&e.enabled, 1) + } else { + atomic.StoreUint32(&e.enabled, 0) + } +} + +func (e *endpoint) Disable() { + e.setEnabled(false) } // DefaultTTL is unused for ARP. It implements stack.NetworkEndpoint. 
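The Enable/Disable plumbing above treats a uint32, accessed with atomic loads and stores, as a lock-free boolean so packet-handling paths can check the flag without taking a lock. A minimal standalone sketch of the same pattern (all names here are illustrative):

package main

import (
	"fmt"
	"sync/atomic"
)

// enabledFlag mirrors endpoint.enabled: 1 means enabled, 0 means disabled,
// and every access is atomic so readers never need to lock.
type enabledFlag struct {
	v uint32
}

func (f *enabledFlag) set(enabled bool) {
	if enabled {
		atomic.StoreUint32(&f.v, 1)
		return
	}
	atomic.StoreUint32(&f.v, 0)
}

func (f *enabledFlag) get() bool {
	return atomic.LoadUint32(&f.v) == 1
}

func main() {
	var f enabledFlag
	f.set(true)
	fmt.Println(f.get()) // true
	f.set(false)
	fmt.Println(f.get()) // false
}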
@@ -54,23 +91,17 @@ func (e *endpoint) DefaultTTL() uint8 { } func (e *endpoint) MTU() uint32 { - lmtu := e.linkEP.MTU() + lmtu := e.nic.MTU() return lmtu - uint32(e.MaxHeaderLength()) } -func (e *endpoint) NICID() tcpip.NICID { - return e.nicID -} - -func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.linkEP.Capabilities() -} - func (e *endpoint) MaxHeaderLength() uint16 { - return e.linkEP.MaxHeaderLength() + header.ARPSize + return e.nic.MaxHeaderLength() + header.ARPSize } -func (e *endpoint) Close() {} +func (e *endpoint) Close() { + e.AddressableEndpointState.Cleanup() +} func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported @@ -78,7 +109,7 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderPara // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber. func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { - return e.protocol.Number() + return ProtocolNumber } // WritePackets implements stack.NetworkEndpoint.WritePackets. @@ -91,6 +122,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu } func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { + if !e.isEnabled() { + return + } + h := header.ARP(pkt.NetworkHeader().View()) if !h.IsValid() { return @@ -99,25 +134,66 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { switch h.Op() { case header.ARPRequest: localAddr := tcpip.Address(h.ProtocolAddressTarget()) - if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 { - return // we have no useful answer, ignore the request + + if e.nud == nil { + if e.linkAddrCache.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 { + return // we have no useful answer, ignore the request + } + + addr := tcpip.Address(h.ProtocolAddressSender()) + linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) + e.linkAddrCache.AddLinkAddress(e.nic.ID(), addr, linkAddr) + } else { + if r.Stack().CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 { + return // we have no useful answer, ignore the request + } + + remoteAddr := tcpip.Address(h.ProtocolAddressSender()) + remoteLinkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) + e.nud.HandleProbe(remoteAddr, localAddr, ProtocolNumber, remoteLinkAddr, e.protocol) } - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: int(e.linkEP.MaxHeaderLength()) + header.ARPSize, + + // As per RFC 826, under Packet Reception: + // Swap hardware and protocol fields, putting the local hardware and + // protocol addresses in the sender fields. + // + // Send the packet to the (new) target hardware address on the same + // hardware on which the request was received. 
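+ // Capture the request's sender first: it becomes both the link-layer
+ // destination and the ARP target of the reply built below.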
+ origSender := h.HardwareAddressSender() + r.RemoteLinkAddress = tcpip.LinkAddress(origSender) + respPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(e.nic.MaxHeaderLength()) + header.ARPSize, }) - packet := header.ARP(pkt.NetworkHeader().Push(header.ARPSize)) + packet := header.ARP(respPkt.NetworkHeader().Push(header.ARPSize)) packet.SetIPv4OverEthernet() packet.SetOp(header.ARPReply) copy(packet.HardwareAddressSender(), r.LocalLinkAddress[:]) copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget()) - copy(packet.HardwareAddressTarget(), h.HardwareAddressSender()) + copy(packet.HardwareAddressTarget(), origSender) copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender()) - _ = e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt) - fallthrough // also fill the cache from requests + _ = e.nic.WritePacket(r, nil /* gso */, ProtocolNumber, respPkt) + case header.ARPReply: addr := tcpip.Address(h.ProtocolAddressSender()) linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) - e.linkAddrCache.AddLinkAddress(e.nicID, addr, linkAddr) + + if e.nud == nil { + e.linkAddrCache.AddLinkAddress(e.nic.ID(), addr, linkAddr) + return + } + + // The solicited, override, and isRouter flags are not available for ARP; + // they are only available for IPv6 Neighbor Advertisements. + e.nud.HandleConfirmation(addr, linkAddr, stack.ReachabilityConfirmationFlags{ + // Solicited and unsolicited (also referred to as gratuitous) ARP Replies + // are handled equivalently to a solicited Neighbor Advertisement. + Solicited: true, + // If a different link address is received than the one cached, the entry + // should always go to Stale. + Override: false, + // ARP does not distinguish between router and non-router hosts. + IsRouter: false, + }) } } @@ -134,13 +210,15 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress } -func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint { - return &endpoint{ +func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.LinkAddressCache, nud stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint { + e := &endpoint{ protocol: p, - nicID: nicID, - linkEP: sender, + nic: nic, linkAddrCache: linkAddrCache, + nud: nud, } + e.AddressableEndpointState.Init(e) + return e } // LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol. @@ -151,6 +229,7 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error { r := &stack.Route{ + NetProto: ProtocolNumber, RemoteLinkAddress: remoteLinkAddr, } if len(r.RemoteLinkAddress) == 0 { @@ -182,12 +261,12 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo } // SetOption implements stack.NetworkProtocol.SetOption. -func (*protocol) SetOption(option interface{}) *tcpip.Error { +func (*protocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } // Option implements stack.NetworkProtocol.Option. 
-func (*protocol) Option(option interface{}) *tcpip.Error { +func (*protocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } @@ -199,14 +278,14 @@ func (*protocol) Wait() {} // Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - _, ok = pkt.NetworkHeader().Consume(header.ARPSize) - if !ok { - return 0, false, false - } - return 0, false, true + return 0, false, parse.ARP(pkt) } // NewProtocol returns an ARP network protocol. -func NewProtocol() stack.NetworkProtocol { +// +// Note, to make sure that the ARP endpoint receives ARP packets, the "arp" +// address must be added to every NIC that should respond to ARP requests. See +// ProtocolAddress for more details. +func NewProtocol(*stack.Stack) stack.NetworkProtocol { return &protocol{} } diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index c2c3e6891..626af975a 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -16,10 +16,12 @@ package arp_test import ( "context" + "fmt" "strconv" "testing" "time" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -32,57 +34,192 @@ import ( ) const ( - stackLinkAddr1 = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") - stackLinkAddr2 = tcpip.LinkAddress("\x0b\x0b\x0c\x0c\x0d\x0d") - stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") - stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") - stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") + nicID = 1 + + stackAddr = tcpip.Address("\x0a\x00\x00\x01") + stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") + + remoteAddr = tcpip.Address("\x0a\x00\x00\x02") + remoteLinkAddr = tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06") + + unknownAddr = tcpip.Address("\x0a\x00\x00\x03") defaultChannelSize = 1 defaultMTU = 65536 + + // eventChanSize defines the size of event channels used by the neighbor + // cache's event dispatcher. The size chosen here needs to be sufficient to + // queue all the events received during tests before consumption. + // If eventChanSize is too small, the tests may deadlock. + eventChanSize = 32 +) + +type eventType uint8 + +const ( + entryAdded eventType = iota + entryChanged + entryRemoved ) +func (t eventType) String() string { + switch t { + case entryAdded: + return "add" + case entryChanged: + return "change" + case entryRemoved: + return "remove" + default: + return fmt.Sprintf("unknown (%d)", t) + } +} + +type eventInfo struct { + eventType eventType + nicID tcpip.NICID + addr tcpip.Address + linkAddr tcpip.LinkAddress + state stack.NeighborState +} + +func (e eventInfo) String() string { + return fmt.Sprintf("%s event for NIC #%d, addr=%q, linkAddr=%q, state=%q", e.eventType, e.nicID, e.addr, e.linkAddr, e.state) +} + +// arpDispatcher implements NUDDispatcher to validate the dispatching of +// events upon certain NUD state machine events. 
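+//
+// Events are buffered on C, so a test can either block until an expected
+// event arrives (waitForEvent) or assert that nothing further was queued
+// (nextEvent).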
+type arpDispatcher struct { + // C is where events are queued + C chan eventInfo +} + +var _ stack.NUDDispatcher = (*arpDispatcher)(nil) + +func (d *arpDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state stack.NeighborState, updatedAt time.Time) { + e := eventInfo{ + eventType: entryAdded, + nicID: nicID, + addr: addr, + linkAddr: linkAddr, + state: state, + } + d.C <- e +} + +func (d *arpDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state stack.NeighborState, updatedAt time.Time) { + e := eventInfo{ + eventType: entryChanged, + nicID: nicID, + addr: addr, + linkAddr: linkAddr, + state: state, + } + d.C <- e +} + +func (d *arpDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state stack.NeighborState, updatedAt time.Time) { + e := eventInfo{ + eventType: entryRemoved, + nicID: nicID, + addr: addr, + linkAddr: linkAddr, + state: state, + } + d.C <- e +} + +func (d *arpDispatcher) waitForEvent(ctx context.Context, want eventInfo) error { + select { + case got := <-d.C: + if diff := cmp.Diff(got, want, cmp.AllowUnexported(got)); diff != "" { + return fmt.Errorf("got invalid event (-got +want):\n%s", diff) + } + case <-ctx.Done(): + return fmt.Errorf("%s for %s", ctx.Err(), want) + } + return nil +} + +func (d *arpDispatcher) waitForEventWithTimeout(want eventInfo, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + return d.waitForEvent(ctx, want) +} + +func (d *arpDispatcher) nextEvent() (eventInfo, bool) { + select { + case event := <-d.C: + return event, true + default: + return eventInfo{}, false + } +} + type testContext struct { - t *testing.T - linkEP *channel.Endpoint - s *stack.Stack + s *stack.Stack + linkEP *channel.Endpoint + nudDisp *arpDispatcher } -func newTestContext(t *testing.T) *testContext { +func newTestContext(t *testing.T, useNeighborCache bool) *testContext { + c := stack.DefaultNUDConfigurations() + // Transition from Reachable to Stale almost immediately to test if receiving + // probes refreshes positive reachability. + c.BaseReachableTime = time.Microsecond + + d := arpDispatcher{ + // Create an event channel large enough so the neighbor cache doesn't block + // while dispatching events. Blocking could interfere with the timing of + // NUD transitions. 
+ C: make(chan eventInfo, eventChanSize), + } + s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4}, + NUDConfigs: c, + NUDDisp: &d, + UseNeighborCache: useNeighborCache, }) - ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1) + ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr) + ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired + wep := stack.LinkEndpoint(ep) if testing.Verbose() { wep = sniffer.New(ep) } - if err := s.CreateNIC(1, wep); err != nil { + if err := s.CreateNIC(nicID, wep); err != nil { t.Fatalf("CreateNIC failed: %v", err) } - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil { + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil { t.Fatalf("AddAddress for ipv4 failed: %v", err) } - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil { - t.Fatalf("AddAddress for ipv4 failed: %v", err) + if !useNeighborCache { + // The remote address needs to be assigned to the NIC so we can receive and + // verify outgoing ARP packets. The neighbor cache isn't concerned with + // this; the tests that use linkAddrCache expect the ARP responses to be + // received by the same NIC. + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, remoteAddr); err != nil { + t.Fatalf("AddAddress for ipv4 failed: %v", err) + } } - if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { t.Fatalf("AddAddress for arp failed: %v", err) } s.SetRouteTable([]tcpip.Route{{ Destination: header.IPv4EmptySubnet, - NIC: 1, + NIC: nicID, }}) return &testContext{ - t: t, - s: s, - linkEP: ep, + s: s, + linkEP: ep, + nudDisp: &d, } } @@ -91,7 +228,7 @@ func (c *testContext) cleanup() { } func TestDirectRequest(t *testing.T) { - c := newTestContext(t) + c := newTestContext(t, false /* useNeighborCache */) defer c.cleanup() const senderMAC = "\x01\x02\x03\x04\x05\x06" @@ -111,7 +248,7 @@ func TestDirectRequest(t *testing.T) { })) } - for i, address := range []tcpip.Address{stackAddr1, stackAddr2} { + for i, address := range []tcpip.Address{stackAddr, remoteAddr} { t.Run(strconv.Itoa(i), func(t *testing.T) { inject(address) pi, _ := c.linkEP.ReadContext(context.Background()) @@ -122,7 +259,7 @@ func TestDirectRequest(t *testing.T) { if !rep.IsValid() { t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep) } - if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr1; got != want { + if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want { t.Errorf("got HardwareAddressSender = %s, want = %s", got, want) } if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want { @@ -137,7 +274,7 @@ func TestDirectRequest(t *testing.T) { }) } - inject(stackAddrBad) + inject(unknownAddr) // Sleep tests are gross, but this will only potentially flake // if there's a bug. If there is no bug this will reliably // succeed. 
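The nextEvent helper above drains the event channel with a select/default so a test can assert that no unexpected NUD events remain queued. A self-contained sketch of the same non-blocking drain pattern (illustrative names):

package main

import "fmt"

// drain returns every value still buffered on ch without ever blocking,
// which is how a test can check that no further events were dispatched.
func drain(ch <-chan string) []string {
	var got []string
	for {
		select {
		case v := <-ch:
			got = append(got, v)
		default:
			return got
		}
	}
}

func main() {
	ch := make(chan string, 4)
	ch <- "add"
	ch <- "change"
	fmt.Println(drain(ch)) // [add change]
}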
@@ -148,6 +285,144 @@ func TestDirectRequest(t *testing.T) { } } +func TestDirectRequestWithNeighborCache(t *testing.T) { + c := newTestContext(t, true /* useNeighborCache */) + defer c.cleanup() + + tests := []struct { + name string + senderAddr tcpip.Address + senderLinkAddr tcpip.LinkAddress + targetAddr tcpip.Address + isValid bool + }{ + { + name: "Loopback", + senderAddr: stackAddr, + senderLinkAddr: stackLinkAddr, + targetAddr: stackAddr, + isValid: true, + }, + { + name: "Remote", + senderAddr: remoteAddr, + senderLinkAddr: remoteLinkAddr, + targetAddr: stackAddr, + isValid: true, + }, + { + name: "RemoteInvalidTarget", + senderAddr: remoteAddr, + senderLinkAddr: remoteLinkAddr, + targetAddr: unknownAddr, + isValid: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Inject an incoming ARP request. + v := make(buffer.View, header.ARPSize) + h := header.ARP(v) + h.SetIPv4OverEthernet() + h.SetOp(header.ARPRequest) + copy(h.HardwareAddressSender(), test.senderLinkAddr) + copy(h.ProtocolAddressSender(), test.senderAddr) + copy(h.ProtocolAddressTarget(), test.targetAddr) + c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{ + Data: v.ToVectorisedView(), + }) + + if !test.isValid { + // No packets should be sent after receiving an invalid ARP request. + // There is no need to perform a blocking read here, since packets are + // sent in the same function that handles ARP requests. + if pkt, ok := c.linkEP.Read(); ok { + t.Errorf("unexpected packet sent with network protocol number %d", pkt.Proto) + } + return + } + + // Verify an ARP response was sent. + pi, ok := c.linkEP.Read() + if !ok { + t.Fatal("expected ARP response to be sent, got none") + } + + if pi.Proto != arp.ProtocolNumber { + t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto) + } + rep := header.ARP(pi.Pkt.NetworkHeader().View()) + if !rep.IsValid() { + t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep) + } + if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want { + t.Errorf("got HardwareAddressSender() = %s, want = %s", got, want) + } + if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want { + t.Errorf("got ProtocolAddressSender() = %s, want = %s", got, want) + } + if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress(h.HardwareAddressSender()); got != want { + t.Errorf("got HardwareAddressTarget() = %s, want = %s", got, want) + } + if got, want := tcpip.Address(rep.ProtocolAddressTarget()), tcpip.Address(h.ProtocolAddressSender()); got != want { + t.Errorf("got ProtocolAddressTarget() = %s, want = %s", got, want) + } + + // Verify the sender was saved in the neighbor cache. 
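+ // A probe from a previously unknown neighbor should surface as a single
+ // entryAdded event in the Stale state.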
+ wantEvent := eventInfo{ + eventType: entryAdded, + nicID: nicID, + addr: test.senderAddr, + linkAddr: tcpip.LinkAddress(test.senderLinkAddr), + state: stack.Stale, + } + if err := c.nudDisp.waitForEventWithTimeout(wantEvent, time.Second); err != nil { + t.Fatal(err) + } + + neighbors, err := c.s.Neighbors(nicID) + if err != nil { + t.Fatalf("c.s.Neighbors(%d): %s", nicID, err) + } + + neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry) + for _, n := range neighbors { + if existing, ok := neighborByAddr[n.Addr]; ok { + if diff := cmp.Diff(existing, n); diff != "" { + t.Fatalf("duplicate neighbor entry found (-existing +got):\n%s", diff) + } + t.Fatalf("exact neighbor entry duplicate found for addr=%s", n.Addr) + } + neighborByAddr[n.Addr] = n + } + + neigh, ok := neighborByAddr[test.senderAddr] + if !ok { + t.Fatalf("expected neighbor entry with Addr = %s", test.senderAddr) + } + if got, want := neigh.LinkAddr, test.senderLinkAddr; got != want { + t.Errorf("got neighbor LinkAddr = %s, want = %s", got, want) + } + if got, want := neigh.LocalAddr, stackAddr; got != want { + t.Errorf("got neighbor LocalAddr = %s, want = %s", got, want) + } + if got, want := neigh.State, stack.Stale; got != want { + t.Errorf("got neighbor State = %s, want = %s", got, want) + } + + // No more events should be dispatched + for { + event, ok := c.nudDisp.nextEvent() + if !ok { + break + } + t.Errorf("unexpected %s", event) + } + }) + } +} + func TestLinkAddressRequest(t *testing.T) { tests := []struct { name string @@ -156,8 +431,8 @@ func TestLinkAddressRequest(t *testing.T) { }{ { name: "Unicast", - remoteLinkAddr: stackLinkAddr2, - expectLinkAddr: stackLinkAddr2, + remoteLinkAddr: remoteLinkAddr, + expectLinkAddr: remoteLinkAddr, }, { name: "Multicast", @@ -167,15 +442,15 @@ func TestLinkAddressRequest(t *testing.T) { } for _, test := range tests { - p := arp.NewProtocol() + p := arp.NewProtocol(nil) linkRes, ok := p.(stack.LinkAddressResolver) if !ok { t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver") } - linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1) - if err := linkRes.LinkAddressRequest(stackAddr1, stackAddr2, test.remoteLinkAddr, linkEP); err != nil { - t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr1, stackAddr2, test.remoteLinkAddr, err) + linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr) + if err := linkRes.LinkAddressRequest(stackAddr, remoteAddr, test.remoteLinkAddr, linkEP); err != nil { + t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr, remoteAddr, test.remoteLinkAddr, err) } pkt, ok := linkEP.Read() diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index d1c728ccf..47fb63290 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -29,6 +29,8 @@ go_library( "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", ], ) @@ -41,5 +43,10 @@ go_test( "reassembler_test.go", ], library = ":fragmentation", - deps = ["//pkg/tcpip/buffer"], + deps = [ + "//pkg/tcpip/buffer", + "//pkg/tcpip/faketime", + "//pkg/tcpip/network/testutil", + "@com_github_google_go_cmp//cmp:go_default_library", + ], ) diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 1827666c5..ed502a473 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -13,7 +13,7 @@ // 
limitations under the License. // Package fragmentation contains the implementation of IP fragmentation. -// It is based on RFC 791 and RFC 815. +// It is based on RFC 791, RFC 815 and RFC 8200. package fragmentation import ( @@ -25,12 +25,10 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( - // DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. - DefaultReassembleTimeout = 30 * time.Second - // HighFragThreshold is the threshold at which we start trimming old // fragmented packets. Linux uses a default value of 4 MB. See // net.ipv4.ipfrag_high_thresh for more information. @@ -81,6 +79,8 @@ type Fragmentation struct { size int timeout time.Duration blockSize uint16 + clock tcpip.Clock + releaseJob *tcpip.Job } // NewFragmentation creates a new Fragmentation. @@ -97,7 +97,7 @@ type Fragmentation struct { // reassemblingTimeout specifies the maximum time allowed to reassemble a packet. // Fragments are lazily evicted only when a new packet with an // already existing fragmentation-id arrives after the timeout. -func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { +func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock) *Fragmentation { if lowMemoryLimit >= highMemoryLimit { lowMemoryLimit = highMemoryLimit } @@ -110,64 +110,77 @@ func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, rea blockSize = minBlockSize } - return &Fragmentation{ + f := &Fragmentation{ reassemblers: make(map[FragmentID]*reassembler), highLimit: highMemoryLimit, lowLimit: lowMemoryLimit, timeout: reassemblingTimeout, blockSize: blockSize, + clock: clock, } + f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked) + + return f } // Process processes an incoming fragment belonging to an ID and returns a -// complete packet when all the packets belonging to that ID have been received. +// complete packet and its protocol number when all the packets belonging to +// that ID have been received. // // [first, last] is the range of the fragment bytes. // // first must be a multiple of the block size f is configured with. The size // of the fragment data must be a multiple of the block size, unless there are // no fragments following this fragment (more set to false). -func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) { +// +// proto is the protocol number marked in the fragment being processed. It has +// to be given here outside of the FragmentID struct because IPv6 should not use +// the protocol to identify a fragment.
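+//
+// A typical call, with illustrative values (these names are not part of this
+// change), looks like:
+//
+//	payload, proto, done, err := f.Process(id, first, last, more, hdr.Protocol(), vv)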
+func (f *Fragmentation) Process( + id FragmentID, first, last uint16, more bool, proto uint8, vv buffer.VectorisedView) ( + buffer.VectorisedView, uint8, bool, error) { if first > last { - return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) + return buffer.VectorisedView{}, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) } if first%f.blockSize != 0 { - return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) + return buffer.VectorisedView{}, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) } fragmentSize := last - first + 1 if more && fragmentSize%f.blockSize != 0 { - return buffer.VectorisedView{}, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) + return buffer.VectorisedView{}, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) } if l := vv.Size(); l < int(fragmentSize) { - return buffer.VectorisedView{}, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) + return buffer.VectorisedView{}, 0, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) } vv.CapLength(int(fragmentSize)) f.mu.Lock() r, ok := f.reassemblers[id] - if ok && r.tooOld(f.timeout) { - // This is very likely to be an id-collision or someone performing a slow-rate attack. - f.release(r) - ok = false - } if !ok { - r = newReassembler(id) + r = newReassembler(id, f.clock) f.reassemblers[id] = r + wasEmpty := f.rList.Empty() f.rList.PushFront(r) + if wasEmpty { + // If we have just pushed a first reassembler into an empty list, we + // should kickstart the release job. The release job will keep + // rescheduling itself until the list becomes empty. + f.releaseReassemblersLocked() + } } f.mu.Unlock() - res, done, consumed, err := r.process(first, last, more, vv) + res, firstFragmentProto, done, consumed, err := r.process(first, last, more, proto, vv) if err != nil { // We probably got an invalid sequence of fragments. Just // discard the reassembler and move on. f.mu.Lock() f.release(r) f.mu.Unlock() - return buffer.VectorisedView{}, false, fmt.Errorf("fragmentation processing error: %v", err) + return buffer.VectorisedView{}, 0, false, fmt.Errorf("fragmentation processing error: %w", err) } f.mu.Lock() f.size += consumed @@ -186,7 +199,7 @@ func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv } } f.mu.Unlock() - return res, done, nil + return res, firstFragmentProto, done, nil } func (f *Fragmentation) release(r *reassembler) { @@ -204,3 +217,102 @@ func (f *Fragmentation) release(r *reassembler) { f.size = 0 } } + +// releaseReassemblersLocked releases already-expired reassemblers, then +// schedules the job to call back itself for the remaining reassemblers if +// any. This function must be called with f.mu locked. +func (f *Fragmentation) releaseReassemblersLocked() { + now := f.clock.NowMonotonic() + for { + // The reassembler at the end of the list is the oldest. + r := f.rList.Back() + if r == nil { + // The list is empty. 
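+ // Nothing left to release; the job is re-armed by Process the next
+ // time a reassembler is pushed onto an empty list.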
+ break + } + elapsed := time.Duration(now-r.creationTime) * time.Nanosecond + if f.timeout > elapsed { + // If the oldest reassembler has not expired, schedule the release + // job so that this function is called back when it has expired. + f.releaseJob.Schedule(f.timeout - elapsed) + break + } + // If the oldest reassembler has already expired, release it. + f.release(r) + } +} + +// PacketFragmenter is the book-keeping struct for packet fragmentation. +type PacketFragmenter struct { + transportHeader buffer.View + data buffer.VectorisedView + reserve int + innerMTU int + fragmentCount int + currentFragment int + fragmentOffset int +} + +// MakePacketFragmenter prepares the struct needed for packet fragmentation. +// +// pkt is the packet to be fragmented. +// +// innerMTU is the maximum number of bytes of fragmentable data a fragment can +// have. +// +// reserve is the number of bytes that should be reserved for the headers in +// each generated fragment. +func MakePacketFragmenter(pkt *stack.PacketBuffer, innerMTU int, reserve int) PacketFragmenter { + // As per RFC 8200 Section 4.5, some IPv6 extension headers should not be + // repeated in each fragment. However we do not currently support any header + // of that kind yet, so the following computation is valid for both IPv4 and + // IPv6. + // TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are + // supported for outbound packets, the fragmentable data should not include + // these headers. + var fragmentableData buffer.VectorisedView + fragmentableData.AppendView(pkt.TransportHeader().View()) + fragmentableData.Append(pkt.Data) + fragmentCount := (fragmentableData.Size() + innerMTU - 1) / innerMTU + + return PacketFragmenter{ + data: fragmentableData, + reserve: reserve, + innerMTU: innerMTU, + fragmentCount: fragmentCount, + } +} + +// BuildNextFragment returns a packet with the payload of the next fragment, +// along with the fragment's offset, the number of bytes copied and a boolean +// indicating if there are more fragments left or not. If this function is +// called again after it indicated that no more fragments were left, it will +// panic. +// +// Note that the returned packet will not have its network and link headers +// populated, but space for them will be reserved. The transport header will be +// stored in the packet's data. +func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) { + if pf.currentFragment >= pf.fragmentCount { + panic("BuildNextFragment should not be called again after the last fragment was returned") + } + + fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: pf.reserve, + }) + + // Copy data for the fragment. + copied := pf.data.ReadToVV(&fragPkt.Data, pf.innerMTU) + + offset := pf.fragmentOffset + pf.fragmentOffset += copied + pf.currentFragment++ + more := pf.currentFragment != pf.fragmentCount + + return fragPkt, offset, copied, more +} + +// RemainingFragmentCount returns the number of fragments left to be built. 
+func (pf *PacketFragmenter) RemainingFragmentCount() int { + return pf.fragmentCount - pf.currentFragment +} diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index 9eedd33c4..d3c7d7f92 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -20,9 +20,16 @@ import ( "testing" "time" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/network/testutil" ) +// reassembleTimeout is a dummy timeout used for testing, where the clock never +// advances. +const reassembleTimeout = 1 + // vv is a helper to build VectorisedView from different strings. func vv(size int, pieces ...string) buffer.VectorisedView { views := make([]buffer.View, len(pieces)) @@ -38,12 +45,14 @@ type processInput struct { first uint16 last uint16 more bool + proto uint8 vv buffer.VectorisedView } type processOutput struct { - vv buffer.VectorisedView - done bool + vv buffer.VectorisedView + proto uint8 + done bool } var processTestCases = []struct { @@ -63,6 +72,17 @@ var processTestCases = []struct { }, }, { + comment: "Next Header protocol mismatch", + in: []processInput{ + {id: FragmentID{ID: 0}, first: 0, last: 1, more: true, proto: 6, vv: vv(2, "01")}, + {id: FragmentID{ID: 0}, first: 2, last: 3, more: false, proto: 17, vv: vv(2, "23")}, + }, + out: []processOutput{ + {vv: buffer.VectorisedView{}, done: false}, + {vv: vv(4, "01", "23"), proto: 6, done: true}, + }, + }, + { comment: "Two IDs", in: []processInput{ {id: FragmentID{ID: 0}, first: 0, last: 1, more: true, vv: vv(2, "01")}, @@ -82,19 +102,27 @@ func TestFragmentationProcess(t *testing.T) { for _, c := range processTestCases { t.Run(c.comment, func(t *testing.T) { - f := NewFragmentation(minBlockSize, 1024, 512, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 1024, 512, reassembleTimeout, &faketime.NullClock{}) + firstFragmentProto := c.in[0].proto for i, in := range c.in { - vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv) + vv, proto, done, err := f.Process(in.id, in.first, in.last, in.more, in.proto, in.vv) if err != nil { - t.Fatalf("f.Process(%+v, %+d, %+d, %t, %+v) failed: %v", in.id, in.first, in.last, in.more, in.vv, err) + t.Fatalf("f.Process(%+v, %d, %d, %t, %d, %X) failed: %s", + in.id, in.first, in.last, in.more, in.proto, in.vv.ToView(), err) } if !reflect.DeepEqual(vv, c.out[i].vv) { - t.Errorf("got Process(%d) = %+v, want = %+v", i, vv, c.out[i].vv) + t.Errorf("got Process(%+v, %d, %d, %t, %d, %X) = (%X, _, _, _), want = (%X, _, _, _)", + in.id, in.first, in.last, in.more, in.proto, in.vv.ToView(), vv.ToView(), c.out[i].vv.ToView()) } if done != c.out[i].done { - t.Errorf("got Process(%d) = %+v, want = %+v", i, done, c.out[i].done) + t.Errorf("got Process(%+v, %d, %d, %t, %d, _) = (_, _, %t, _), want = (_, _, %t, _)", + in.id, in.first, in.last, in.more, in.proto, done, c.out[i].done) } if c.out[i].done { + if firstFragmentProto != proto { + t.Errorf("got Process(%+v, %d, %d, %t, %d, _) = (_, %d, _, _), want = (_, %d, _, _)", + in.id, in.first, in.last, in.more, in.proto, proto, firstFragmentProto) + } if _, ok := f.reassemblers[in.id]; ok { t.Errorf("Process(%d) did not remove buffer from reassemblers", i) } @@ -110,35 +138,136 @@ func TestFragmentationProcess(t *testing.T) { } func TestReassemblingTimeout(t *testing.T) { - timeout :=
time.Millisecond - f := NewFragmentation(minBlockSize, 1024, 512, timeout) - // Send first fragment with id = 0, first = 0, last = 0, and more = true. - f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) - // Sleep more than the timeout. - time.Sleep(2 * timeout) - // Send another fragment that completes a packet. - // However, no packet should be reassembled because the fragment arrived after the timeout. - _, done, err := f.Process(FragmentID{}, 1, 1, false, vv(1, "1")) - if err != nil { - t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err) + const ( + reassemblyTimeout = time.Millisecond + protocol = 0xff + ) + + type fragment struct { + first uint16 + last uint16 + more bool + data string + } + + type event struct { + // name is a nickname of this event. + name string + + // clockAdvance is a duration to advance the clock. The clock advances + // before a fragment specified in the fragment field is processed. + clockAdvance time.Duration + + // fragment is a fragment to process. This can be nil if there is no + // fragment to process. + fragment *fragment + + // expectDone is true if the fragmentation instance should report the + // reassembly is done after the fragment is processed. + expectDone bool + + // sizeAfterEvent is the expected size of the fragmentation instance after + // the event. + sizeAfterEvent int } - if done { - t.Errorf("Fragmentation does not respect the reassembling timeout.") + + half1 := &fragment{first: 0, last: 0, more: true, data: "0"} + half2 := &fragment{first: 1, last: 1, more: false, data: "1"} + + tests := []struct { + name string + events []event + }{ + { + name: "half1 and half2 are reassembled successfully", + events: []event{ + { + name: "half1", + fragment: half1, + expectDone: false, + sizeAfterEvent: 1, + }, + { + name: "half2", + fragment: half2, + expectDone: true, + sizeAfterEvent: 0, + }, + }, + }, + { + name: "half1 timeout, half2 timeout", + events: []event{ + { + name: "half1", + fragment: half1, + expectDone: false, + sizeAfterEvent: 1, + }, + { + name: "half1 just before reassembly timeout", + clockAdvance: reassemblyTimeout - 1, + sizeAfterEvent: 1, + }, + { + name: "half1 reassembly timeout", + clockAdvance: 1, + sizeAfterEvent: 0, + }, + { + name: "half2", + fragment: half2, + expectDone: false, + sizeAfterEvent: 1, + }, + { + name: "half2 just before reassembly timeout", + clockAdvance: reassemblyTimeout - 1, + sizeAfterEvent: 1, + }, + { + name: "half2 reassembly timeout", + clockAdvance: 1, + sizeAfterEvent: 0, + }, + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + clock := faketime.NewManualClock() + f := NewFragmentation(minBlockSize, HighFragThreshold, LowFragThreshold, reassemblyTimeout, clock) + for _, event := range test.events { + clock.Advance(event.clockAdvance) + if frag := event.fragment; frag != nil { + _, _, done, err := f.Process(FragmentID{}, frag.first, frag.last, frag.more, protocol, vv(len(frag.data), frag.data)) + if err != nil { + t.Fatalf("%s: f.Process failed: %s", event.name, err) + } + if done != event.expectDone { + t.Fatalf("%s: got done = %t, want = %t", event.name, done, event.expectDone) + } + } + if got, want := f.size, event.sizeAfterEvent; got != want { + t.Errorf("%s: got f.size = %d, want = %d", event.name, got, want) + } + } + }) } } func TestMemoryLimits(t *testing.T) { - f := NewFragmentation(minBlockSize, 3, 1, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 3, 1, reassembleTimeout, &faketime.NullClock{}) // Send first fragment
with id = 0. - f.Process(FragmentID{ID: 0}, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{ID: 0}, 0, 0, true, 0xFF, vv(1, "0")) // Send first fragment with id = 1. - f.Process(FragmentID{ID: 1}, 0, 0, true, vv(1, "1")) + f.Process(FragmentID{ID: 1}, 0, 0, true, 0xFF, vv(1, "1")) // Send first fragment with id = 2. - f.Process(FragmentID{ID: 2}, 0, 0, true, vv(1, "2")) + f.Process(FragmentID{ID: 2}, 0, 0, true, 0xFF, vv(1, "2")) // Send first fragment with id = 3. This should cause id = 0 and id = 1 to be // evicted. - f.Process(FragmentID{ID: 3}, 0, 0, true, vv(1, "3")) + f.Process(FragmentID{ID: 3}, 0, 0, true, 0xFF, vv(1, "3")) if _, ok := f.reassemblers[FragmentID{ID: 0}]; ok { t.Errorf("Memory limits are not respected: id=0 has not been evicted.") @@ -152,11 +281,11 @@ func TestMemoryLimits(t *testing.T) { } func TestMemoryLimitsIgnoresDuplicates(t *testing.T) { - f := NewFragmentation(minBlockSize, 1, 0, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 1, 0, reassembleTimeout, &faketime.NullClock{}) // Send first fragment with id = 0. - f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0")) // Send the same packet again. - f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0")) got := f.size want := 1 @@ -247,13 +376,123 @@ func TestErrors(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, DefaultReassembleTimeout) - _, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, vv(len(test.data), test.data)) + f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, reassembleTimeout, &faketime.NullClock{}) + _, _, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, 0, vv(len(test.data), test.data)) if !errors.Is(err, test.err) { - t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, _, %v), want = (_, _, %v)", test.first, test.last, test.more, test.data, err, test.err) + t.Errorf("got Process(_, %d, %d, %t, _, %q) = (_, _, _, %v), want = (_, _, _, %v)", test.first, test.last, test.more, test.data, err, test.err) } if done { - t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, true, _), want = (_, false, _)", test.first, test.last, test.more, test.data) + t.Errorf("got Process(_, %d, %d, %t, _, %q) = (_, _, true, _), want = (_, _, false, _)", test.first, test.last, test.more, test.data) + } + }) + } +} + +type fragmentInfo struct { + remaining int + copied int + offset int + more bool +} + +func TestPacketFragmenter(t *testing.T) { + const ( + reserve = 60 + proto = 0 + ) + + tests := []struct { + name string + innerMTU int + transportHeaderLen int + payloadSize int + wantFragments []fragmentInfo + }{ + { + name: "Packet exactly fits in MTU", + innerMTU: 1280, + transportHeaderLen: 0, + payloadSize: 1280, + wantFragments: []fragmentInfo{ + {remaining: 0, copied: 1280, offset: 0, more: false}, + }, + }, + { + name: "Packet exactly does not fit in MTU", + innerMTU: 1000, + transportHeaderLen: 0, + payloadSize: 1001, + wantFragments: []fragmentInfo{ + {remaining: 1, copied: 1000, offset: 0, more: true}, + {remaining: 0, copied: 1, offset: 1000, more: false}, + }, + }, + { + name: "Packet has a transport header", + innerMTU: 560, + transportHeaderLen: 40, + payloadSize: 560, + wantFragments: []fragmentInfo{ + {remaining: 1, copied: 560, offset: 0, more: true}, + {remaining: 0, copied: 40, offset: 560, more: false}, + }, + }, + {
+ name: "Packet has a huge transport header", + innerMTU: 500, + transportHeaderLen: 1300, + payloadSize: 500, + wantFragments: []fragmentInfo{ + {remaining: 3, copied: 500, offset: 0, more: true}, + {remaining: 2, copied: 500, offset: 500, more: true}, + {remaining: 1, copied: 500, offset: 1000, more: true}, + {remaining: 0, copied: 300, offset: 1500, more: false}, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + pkt := testutil.MakeRandPkt(test.transportHeaderLen, reserve, []int{test.payloadSize}, proto) + var originalPayload buffer.VectorisedView + originalPayload.AppendView(pkt.TransportHeader().View()) + originalPayload.Append(pkt.Data) + var reassembledPayload buffer.VectorisedView + pf := MakePacketFragmenter(pkt, test.innerMTU, reserve) + for i := 0; ; i++ { + fragPkt, offset, copied, more := pf.BuildNextFragment() + wantFragment := test.wantFragments[i] + if got := pf.RemainingFragmentCount(); got != wantFragment.remaining { + t.Errorf("(fragment #%d) got pf.RemainingFragmentCount() = %d, want = %d", i, got, wantFragment.remaining) + } + if copied != wantFragment.copied { + t.Errorf("(fragment #%d) got copied = %d, want = %d", i, copied, wantFragment.copied) + } + if offset != wantFragment.offset { + t.Errorf("(fragment #%d) got offset = %d, want = %d", i, offset, wantFragment.offset) + } + if more != wantFragment.more { + t.Errorf("(fragment #%d) got more = %t, want = %t", i, more, wantFragment.more) + } + if got := fragPkt.Size(); got > test.innerMTU { + t.Errorf("(fragment #%d) got fragPkt.Size() = %d, want <= %d", i, got, test.innerMTU) + } + if got := fragPkt.AvailableHeaderBytes(); got != reserve { + t.Errorf("(fragment #%d) got fragPkt.AvailableHeaderBytes() = %d, want = %d", i, got, reserve) + } + if got := fragPkt.TransportHeader().View().Size(); got != 0 { + t.Errorf("(fragment #%d) got fragPkt.TransportHeader().View().Size() = %d, want = 0", i, got) + } + reassembledPayload.Append(fragPkt.Data) + if !more { + if i != len(test.wantFragments)-1 { + t.Errorf("got fragment count = %d, want = %d", i, len(test.wantFragments)-1) + } + break + } + } + if diff := cmp.Diff(originalPayload.ToView(), reassembledPayload.ToView()); diff != "" { + t.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff) } }) } diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index 50d30bbf0..9bb051a30 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -18,9 +18,9 @@ import ( "container/heap" "fmt" "math" - "time" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) @@ -34,21 +34,21 @@ type reassembler struct { reassemblerEntry id FragmentID size int + proto uint8 mu sync.Mutex holes []hole deleted int heap fragHeap done bool - creationTime time.Time + creationTime int64 } -func newReassembler(id FragmentID) *reassembler { +func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler { r := &reassembler{ id: id, holes: make([]hole, 0, 16), - deleted: 0, heap: make(fragHeap, 0, 8), - creationTime: time.Now(), + creationTime: clock.NowMonotonic(), } r.holes = append(r.holes, hole{ first: 0, @@ -78,7 +78,7 @@ func (r *reassembler) updateHoles(first, last uint16, more bool) bool { return used } -func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int, error) { +func (r *reassembler) process(first, last uint16, more bool,
proto uint8, vv buffer.VectorisedView) (buffer.VectorisedView, uint8, bool, int, error) { r.mu.Lock() defer r.mu.Unlock() consumed := 0 @@ -86,7 +86,18 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise // A concurrent goroutine might have already reassembled // the packet and emptied the heap while this goroutine // was waiting on the mutex. We don't have to do anything in this case. - return buffer.VectorisedView{}, false, consumed, nil + return buffer.VectorisedView{}, 0, false, consumed, nil + } + // For IPv6, it is possible to have different Protocol values between + // fragments of a packet (because, unlike IPv4, the Protocol is not used to + // identify a fragment). In this case, only the Protocol of the first + // fragment must be used as per RFC 8200 Section 4.5. + // + // TODO(gvisor.dev/issue/3648): The entire first IP header should be recorded + // here (instead of just the protocol) because most IP options should be + // derived from the first fragment. + if first == 0 { + r.proto = proto } if r.updateHoles(first, last, more) { // We store the incoming packet only if it filled some holes. @@ -96,17 +107,13 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise } // Check if all the holes have been deleted and we are ready to reassemble. if r.deleted < len(r.holes) { - return buffer.VectorisedView{}, false, consumed, nil + return buffer.VectorisedView{}, 0, false, consumed, nil } res, err := r.heap.reassemble() if err != nil { - return buffer.VectorisedView{}, false, consumed, fmt.Errorf("fragment reassembly failed: %v", err) + return buffer.VectorisedView{}, 0, false, consumed, fmt.Errorf("fragment reassembly failed: %w", err) } - return res, true, consumed, nil -} - -func (r *reassembler) tooOld(timeout time.Duration) bool { - return time.Now().Sub(r.creationTime) > timeout + return res, r.proto, true, consumed, nil } func (r *reassembler) checkDoneOrMark() bool { diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index dff7c9dcb..a0a04a027 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -18,6 +18,8 @@ import ( "math" "reflect" "testing" + + "gvisor.dev/gvisor/pkg/tcpip/faketime" ) type updateHolesInput struct { @@ -94,7 +96,7 @@ var holesTestCases = []struct { func TestUpdateHoles(t *testing.T) { for _, c := range holesTestCases { - r := newReassembler(FragmentID{}) + r := newReassembler(FragmentID{}, &faketime.NullClock{}) for _, i := range c.in { r.updateHoles(i.first, i.last, i.more) } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 9007346fe..f20b94d97 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -15,36 +15,48 @@ package ip_test import ( + "strings" "testing" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" ) const ( - localIpv4Addr = "\x0a\x00\x00\x01" - localIpv4PrefixLen = 24 - remoteIpv4Addr = "\x0a\x00\x00\x02" - ipv4SubnetAddr = 
"\x0a\x00\x00\x00" - ipv4SubnetMask = "\xff\xff\xff\x00" - ipv4Gateway = "\x0a\x00\x00\x03" - localIpv6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - localIpv6PrefixLen = 120 - remoteIpv6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" - ipv6SubnetAddr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - ipv6SubnetMask = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00" - ipv6Gateway = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" - nicID = 1 + localIPv4Addr = "\x0a\x00\x00\x01" + remoteIPv4Addr = "\x0a\x00\x00\x02" + ipv4SubnetAddr = "\x0a\x00\x00\x00" + ipv4SubnetMask = "\xff\xff\xff\x00" + ipv4Gateway = "\x0a\x00\x00\x03" + localIPv6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + remoteIPv6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + ipv6SubnetAddr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + ipv6SubnetMask = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00" + ipv6Gateway = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" + nicID = 1 ) +var localIPv4AddrWithPrefix = tcpip.AddressWithPrefix{ + Address: localIPv4Addr, + PrefixLen: 24, +} + +var localIPv6AddrWithPrefix = tcpip.AddressWithPrefix{ + Address: localIPv6Addr, + PrefixLen: 120, +} + // testObject implements two interfaces: LinkEndpoint and TransportDispatcher. // The former is used to pretend that it's a link endpoint so that we can // inspect packets written by the network endpoints. The latter is used to @@ -98,9 +110,10 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff // DeliverTransportPacket is called by network endpoints after parsing incoming // packets. This is used by the test object to verify that the results of the // parsing are expected. 
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) { +func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) stack.TransportPacketDisposition { t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress) t.dataCalls++ + return stack.TransportPacketHandled } // DeliverTransportControlPacket is called by network endpoints after parsing @@ -194,8 +207,8 @@ func (*testObject) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, }) s.CreateNIC(nicID, loopback.New()) s.AddAddress(nicID, ipv4.ProtocolNumber, local) @@ -210,8 +223,8 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, }) s.CreateNIC(nicID, loopback.New()) s.AddAddress(nicID, ipv6.ProtocolNumber, local) @@ -224,33 +237,290 @@ func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { return s.FindRoute(nicID, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */) } -func buildDummyStack(t *testing.T) *stack.Stack { +func buildDummyStackWithLinkEndpoint(t *testing.T) (*stack.Stack, *channel.Endpoint) { t.Helper() s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, }) e := channel.New(0, 1280, "") if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, localIpv4Addr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, localIpv4Addr, err) + v4Addr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: localIPv4AddrWithPrefix} + if err := s.AddProtocolAddress(nicID, v4Addr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %#v) = %s", nicID, v4Addr, err) } - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, localIpv6Addr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, localIpv6Addr, err) + v6Addr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: localIPv6AddrWithPrefix} + if err := s.AddProtocolAddress(nicID, v6Addr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %#v) = %s", nicID, v6Addr, err) } + return s, e +} + +func buildDummyStack(t *testing.T) *stack.Stack { + t.Helper() + + s, _ := buildDummyStackWithLinkEndpoint(t) return s } 
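// A minimal usage sketch (assumption: payload is a buffer.View holding a
// complete IP packet): the channel.Endpoint returned by
// buildDummyStackWithLinkEndpoint stands in for the link layer, so a test can
// inject an inbound frame with InjectInbound and observe outbound packets
// with Read, as the tests below do.
//
//	s, e := buildDummyStackWithLinkEndpoint(t)
//	e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
//		Data: payload.ToVectorisedView(),
//	}))
//	if pkt, ok := e.Read(); ok {
//		_ = pkt.Pkt // Inspect the packet the stack wrote in response.
//	}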
+var _ stack.NetworkInterface = (*testInterface)(nil) + +type testInterface struct { + testObject + + mu struct { + sync.RWMutex + disabled bool + } +} + +func (*testInterface) ID() tcpip.NICID { + return nicID +} + +func (*testInterface) IsLoopback() bool { + return false +} + +func (*testInterface) Name() string { + return "" +} + +func (t *testInterface) Enabled() bool { + t.mu.RLock() + defer t.mu.RUnlock() + return !t.mu.disabled +} + +func (t *testInterface) setEnabled(v bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.mu.disabled = !v +} + +func TestSourceAddressValidation(t *testing.T) { + rxIPv4ICMP := func(e *channel.Endpoint, src tcpip.Address) { + totalLen := header.IPv4MinimumSize + header.ICMPv4MinimumSize + hdr := buffer.NewPrependable(totalLen) + pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) + pkt.SetType(header.ICMPv4Echo) + pkt.SetCode(0) + pkt.SetChecksum(0) + pkt.SetChecksum(^header.Checksum(pkt, 0)) + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: uint16(totalLen), + Protocol: uint8(icmp.ProtocolNumber4), + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: localIPv4Addr, + }) + ip.SetChecksum(^ip.CalculateChecksum()) + + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + rxIPv6ICMP := func(e *channel.Endpoint, src tcpip.Address) { + totalLen := header.IPv6MinimumSize + header.ICMPv6MinimumSize + hdr := buffer.NewPrependable(totalLen) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize)) + pkt.SetType(header.ICMPv6EchoRequest) + pkt.SetCode(0) + pkt.SetChecksum(0) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, src, localIPv6Addr, buffer.VectorisedView{})) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: header.ICMPv6MinimumSize, + NextHeader: uint8(icmp.ProtocolNumber6), + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: localIPv6Addr, + }) + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + tests := []struct { + name string + srcAddress tcpip.Address + rxICMP func(*channel.Endpoint, tcpip.Address) + valid bool + }{ + { + name: "IPv4 valid", + srcAddress: "\x01\x02\x03\x04", + rxICMP: rxIPv4ICMP, + valid: true, + }, + { + name: "IPv6 valid", + srcAddress: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10", + rxICMP: rxIPv6ICMP, + valid: true, + }, + { + name: "IPv4 unspecified", + srcAddress: header.IPv4Any, + rxICMP: rxIPv4ICMP, + valid: true, + }, + { + name: "IPv6 unspecified", + srcAddress: header.IPv6Any, + rxICMP: rxIPv6ICMP, + valid: true, + }, + { + name: "IPv4 multicast", + srcAddress: "\xe0\x00\x00\x01", + rxICMP: rxIPv4ICMP, + valid: false, + }, + { + name: "IPv6 multicast", + srcAddress: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + rxICMP: rxIPv6ICMP, + valid: false, + }, + { + name: "IPv4 broadcast", + srcAddress: header.IPv4Broadcast, + rxICMP: rxIPv4ICMP, + valid: false, + }, + { + name: "IPv4 subnet broadcast", + srcAddress: func() tcpip.Address { + subnet := localIPv4AddrWithPrefix.Subnet() + return subnet.Broadcast() + }(), + rxICMP: rxIPv4ICMP, + valid: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s, e := buildDummyStackWithLinkEndpoint(t) + test.rxICMP(e, test.srcAddress) + + var wantValid uint64 + if test.valid { +
wantValid = 1 + } + + if got, want := s.Stats().IP.InvalidSourceAddressesReceived.Value(), 1-wantValid; got != want { + t.Errorf("got s.Stats().IP.InvalidSourceAddressesReceived.Value() = %d, want = %d", got, want) + } + if got := s.Stats().IP.PacketsDelivered.Value(); got != wantValid { + t.Errorf("got s.Stats().IP.PacketsDelivered.Value() = %d, want = %d", got, wantValid) + } + }) + } +} + +func TestEnableWhenNICDisabled(t *testing.T) { + tests := []struct { + name string + protocolFactory stack.NetworkProtocolFactory + protoNum tcpip.NetworkProtocolNumber + }{ + { + name: "IPv4", + protocolFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + }, + { + name: "IPv6", + protocolFactory: ipv6.NewProtocol, + protoNum: ipv6.ProtocolNumber, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var nic testInterface + nic.setEnabled(false) + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{test.protocolFactory}, + }) + p := s.NetworkProtocolInstance(test.protoNum) + + // We pass nil for all parameters except the NetworkInterface and Stack + // since Enable only depends on these. + ep := p.NewEndpoint(&nic, nil, nil, nil) + + // The endpoint should initially be disabled, regardless of the NIC's enabled + // status. + if ep.Enabled() { + t.Fatal("got ep.Enabled() = true, want = false") + } + nic.setEnabled(true) + if ep.Enabled() { + t.Fatal("got ep.Enabled() = true, want = false") + } + + // Attempting to enable the endpoint while the NIC is disabled should + // fail. + nic.setEnabled(false) + if err := ep.Enable(); err != tcpip.ErrNotPermitted { + t.Fatalf("got ep.Enable() = %s, want = %s", err, tcpip.ErrNotPermitted) + } + // ep should consider the NIC's enabled status when determining its own + // enabled status so we "enable" the NIC to read just the endpoint's + // enabled status. + nic.setEnabled(true) + if ep.Enabled() { + t.Fatal("got ep.Enabled() = true, want = false") + } + + // Enabling the endpoint after the NIC has been enabled should succeed. + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + if !ep.Enabled() { + t.Fatal("got ep.Enabled() = false, want = true") + } + + // ep should consider the NIC's enabled status when determining its own + // enabled status. + nic.setEnabled(false) + if ep.Enabled() { + t.Fatal("got ep.Enabled() = true, want = false") + } + + // Disabling the endpoint when the NIC is enabled should make the endpoint + // disabled. + nic.setEnabled(true) + ep.Disable() + if ep.Enabled() { + t.Fatal("got ep.Enabled() = true, want = false") + } + }) + } +} + func TestIPv4Send(t *testing.T) { - o := testObject{t: t, v4: true} - proto := ipv4.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, nil, &o, buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + v4: true, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, nil) defer ep.Close() // Allocate and initialize the payload view. @@ -266,12 +536,12 @@ func TestIPv4Send(t *testing.T) { }) // Issue the write.
- o.protocol = 123 - o.srcAddr = localIpv4Addr - o.dstAddr = remoteIpv4Addr - o.contents = payload + nic.testObject.protocol = 123 + nic.testObject.srcAddr = localIPv4Addr + nic.testObject.dstAddr = remoteIPv4Addr + nic.testObject.contents = payload - r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr) + r, err := buildIPv4Route(localIPv4Addr, remoteIPv4Addr) if err != nil { t.Fatalf("could not find route: %v", err) } @@ -285,11 +555,21 @@ func TestIPv4Send(t *testing.T) { } func TestIPv4Receive(t *testing.T) { - o := testObject{t: t, v4: true} - proto := ipv4.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + v4: true, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject) defer ep.Close() + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + totalLen := header.IPv4MinimumSize + 30 view := buffer.NewView(totalLen) ip := header.IPv4(view) @@ -298,9 +578,10 @@ func TestIPv4Receive(t *testing.T) { TotalLength: uint16(totalLen), TTL: 20, Protocol: 10, - SrcAddr: remoteIpv4Addr, - DstAddr: localIpv4Addr, + SrcAddr: remoteIPv4Addr, + DstAddr: localIPv4Addr, }) + ip.SetChecksum(^ip.CalculateChecksum()) // Make payload be non-zero. for i := header.IPv4MinimumSize; i < totalLen; i++ { @@ -308,12 +589,12 @@ func TestIPv4Receive(t *testing.T) { } // Give packet to ipv4 endpoint, dispatcher will validate that it's ok. - o.protocol = 10 - o.srcAddr = remoteIpv4Addr - o.dstAddr = localIpv4Addr - o.contents = view[header.IPv4MinimumSize:totalLen] + nic.testObject.protocol = 10 + nic.testObject.srcAddr = remoteIPv4Addr + nic.testObject.dstAddr = localIPv4Addr + nic.testObject.contents = view[header.IPv4MinimumSize:totalLen] - r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr) + r, err := buildIPv4Route(localIPv4Addr, remoteIPv4Addr) if err != nil { t.Fatalf("could not find route: %v", err) } @@ -324,8 +605,8 @@ func TestIPv4Receive(t *testing.T) { t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) } ep.HandlePacket(&r, pkt) - if o.dataCalls != 1 { - t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls) + if nic.testObject.dataCalls != 1 { + t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls) } } @@ -349,17 +630,26 @@ func TestIPv4ReceiveControl(t *testing.T) { {"Non-zero fragment offset", 0, 100, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0}, {"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4MinimumSize + 8}, } - r, err := buildIPv4Route(localIpv4Addr, "\x0a\x00\x00\xbb") + r, err := buildIPv4Route(localIPv4Addr, "\x0a\x00\x00\xbb") if err != nil { t.Fatal(err) } for _, c := range cases { t.Run(c.name, func(t *testing.T) { - o := testObject{t: t} - proto := ipv4.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject) defer ep.Close() + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize view := buffer.NewView(dataOffset + 8) @@ -371,8 +661,9 @@ func TestIPv4ReceiveControl(t *testing.T) { TTL: 20, Protocol: 
uint8(header.ICMPv4ProtocolNumber), SrcAddr: "\x0a\x00\x00\xbb", - DstAddr: localIpv4Addr, + DstAddr: localIPv4Addr, }) + ip.SetChecksum(^ip.CalculateChecksum()) // Create the ICMP header. icmp := header.ICMPv4(view[header.IPv4MinimumSize:]) @@ -389,9 +680,10 @@ func TestIPv4ReceiveControl(t *testing.T) { TTL: 20, Protocol: 10, FragmentOffset: c.fragmentOffset, - SrcAddr: localIpv4Addr, - DstAddr: remoteIpv4Addr, + SrcAddr: localIPv4Addr, + DstAddr: remoteIPv4Addr, }) + ip.SetChecksum(^ip.CalculateChecksum()) // Make payload be non-zero. for i := dataOffset; i < len(view); i++ { @@ -400,27 +692,37 @@ func TestIPv4ReceiveControl(t *testing.T) { // Give packet to IPv4 endpoint, dispatcher will validate that // it's ok. - o.protocol = 10 - o.srcAddr = remoteIpv4Addr - o.dstAddr = localIpv4Addr - o.contents = view[dataOffset:] - o.typ = c.expectedTyp - o.extra = c.expectedExtra + nic.testObject.protocol = 10 + nic.testObject.srcAddr = remoteIPv4Addr + nic.testObject.dstAddr = localIPv4Addr + nic.testObject.contents = view[dataOffset:] + nic.testObject.typ = c.expectedTyp + nic.testObject.extra = c.expectedExtra ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv4MinimumSize)) - if want := c.expectedCount; o.controlCalls != want { - t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want) + if want := c.expectedCount; nic.testObject.controlCalls != want { + t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, nic.testObject.controlCalls, want) } }) } } func TestIPv4FragmentationReceive(t *testing.T) { - o := testObject{t: t, v4: true} - proto := ipv4.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + v4: true, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject) defer ep.Close() + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + totalLen := header.IPv4MinimumSize + 24 frag1 := buffer.NewView(totalLen) @@ -432,9 +734,11 @@ func TestIPv4FragmentationReceive(t *testing.T) { Protocol: 10, FragmentOffset: 0, Flags: header.IPv4FlagMoreFragments, - SrcAddr: remoteIpv4Addr, - DstAddr: localIpv4Addr, + SrcAddr: remoteIPv4Addr, + DstAddr: localIPv4Addr, }) + ip1.SetChecksum(^ip1.CalculateChecksum()) + // Make payload be non-zero. for i := header.IPv4MinimumSize; i < totalLen; i++ { frag1[i] = uint8(i) @@ -448,21 +752,23 @@ func TestIPv4FragmentationReceive(t *testing.T) { TTL: 20, Protocol: 10, FragmentOffset: 24, - SrcAddr: remoteIpv4Addr, - DstAddr: localIpv4Addr, + SrcAddr: remoteIPv4Addr, + DstAddr: localIPv4Addr, }) + ip2.SetChecksum(^ip2.CalculateChecksum()) + // Make payload be non-zero. for i := header.IPv4MinimumSize; i < totalLen; i++ { frag2[i] = uint8(i) } // Give packet to ipv4 endpoint, dispatcher will validate that it's ok. - o.protocol = 10 - o.srcAddr = remoteIpv4Addr - o.dstAddr = localIpv4Addr - o.contents = append(frag1[header.IPv4MinimumSize:totalLen], frag2[header.IPv4MinimumSize:totalLen]...) + nic.testObject.protocol = 10 + nic.testObject.srcAddr = remoteIPv4Addr + nic.testObject.dstAddr = localIPv4Addr + nic.testObject.contents = append(frag1[header.IPv4MinimumSize:totalLen], frag2[header.IPv4MinimumSize:totalLen]...) 
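// The dispatcher will compare the delivered payload against both fragment payloads concatenated, so dataCalls must remain 0 until the second fragment below completes reassembly.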
- r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr) + r, err := buildIPv4Route(localIPv4Addr, remoteIPv4Addr) if err != nil { t.Fatalf("could not find route: %v", err) } @@ -475,8 +781,8 @@ func TestIPv4FragmentationReceive(t *testing.T) { t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) } ep.HandlePacket(&r, pkt) - if o.dataCalls != 0 { - t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls) + if nic.testObject.dataCalls != 0 { + t.Fatalf("Bad number of data calls: got %x, want 0", nic.testObject.dataCalls) } // Send second segment. @@ -487,17 +793,26 @@ func TestIPv4FragmentationReceive(t *testing.T) { t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) } ep.HandlePacket(&r, pkt) - if o.dataCalls != 1 { - t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls) + if nic.testObject.dataCalls != 1 { + t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls) } } func TestIPv6Send(t *testing.T) { - o := testObject{t: t} - proto := ipv6.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, &o, channel.New(0, 1280, ""), buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv6.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, nil) defer ep.Close() + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + // Allocate and initialize the payload view. payload := buffer.NewView(100) for i := 0; i < len(payload); i++ { @@ -511,12 +826,12 @@ func TestIPv6Send(t *testing.T) { }) // Issue the write. - o.protocol = 123 - o.srcAddr = localIpv6Addr - o.dstAddr = remoteIpv6Addr - o.contents = payload + nic.testObject.protocol = 123 + nic.testObject.srcAddr = localIPv6Addr + nic.testObject.dstAddr = remoteIPv6Addr + nic.testObject.contents = payload - r, err := buildIPv6Route(localIpv6Addr, remoteIpv6Addr) + r, err := buildIPv6Route(localIPv6Addr, remoteIPv6Addr) if err != nil { t.Fatalf("could not find route: %v", err) } @@ -530,11 +845,20 @@ func TestIPv6Send(t *testing.T) { } func TestIPv6Receive(t *testing.T) { - o := testObject{t: t} - proto := ipv6.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv6.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject) defer ep.Close() + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + totalLen := header.IPv6MinimumSize + 30 view := buffer.NewView(totalLen) ip := header.IPv6(view) @@ -542,8 +866,8 @@ func TestIPv6Receive(t *testing.T) { PayloadLength: uint16(totalLen - header.IPv6MinimumSize), NextHeader: 10, HopLimit: 20, - SrcAddr: remoteIpv6Addr, - DstAddr: localIpv6Addr, + SrcAddr: remoteIPv6Addr, + DstAddr: localIPv6Addr, }) // Make payload be non-zero. @@ -552,12 +876,12 @@ func TestIPv6Receive(t *testing.T) { } // Give packet to ipv6 endpoint, dispatcher will validate that it's ok. 
- o.protocol = 10 - o.srcAddr = remoteIpv6Addr - o.dstAddr = localIpv6Addr - o.contents = view[header.IPv6MinimumSize:totalLen] + nic.testObject.protocol = 10 + nic.testObject.srcAddr = remoteIPv6Addr + nic.testObject.dstAddr = localIPv6Addr + nic.testObject.contents = view[header.IPv6MinimumSize:totalLen] - r, err := buildIPv6Route(localIpv6Addr, remoteIpv6Addr) + r, err := buildIPv6Route(localIPv6Addr, remoteIPv6Addr) if err != nil { t.Fatalf("could not find route: %v", err) } @@ -569,8 +893,8 @@ func TestIPv6Receive(t *testing.T) { t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) } ep.HandlePacket(&r, pkt) - if o.dataCalls != 1 { - t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls) + if nic.testObject.dataCalls != 1 { + t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls) } } @@ -601,7 +925,7 @@ func TestIPv6ReceiveControl(t *testing.T) { {"Zero-length packet", 0, nil, header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv6MinimumSize + header.ICMPv6DstUnreachableMinimumSize + 8}, } r, err := buildIPv6Route( - localIpv6Addr, + localIPv6Addr, "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa", ) if err != nil { @@ -609,11 +933,20 @@ func TestIPv6ReceiveControl(t *testing.T) { } for _, c := range cases { t.Run(c.name, func(t *testing.T) { - o := testObject{t: t} - proto := ipv6.NewProtocol() - ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t)) + s := buildDummyStack(t) + proto := s.NetworkProtocolInstance(ipv6.ProtocolNumber) + nic := testInterface{ + testObject: testObject{ + t: t, + }, + } + ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject) defer ep.Close() + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + dataOffset := header.IPv6MinimumSize*2 + header.ICMPv6MinimumSize if c.fragmentOffset != nil { dataOffset += header.IPv6FragmentHeaderSize @@ -627,7 +960,7 @@ func TestIPv6ReceiveControl(t *testing.T) { NextHeader: uint8(header.ICMPv6ProtocolNumber), HopLimit: 20, SrcAddr: outerSrcAddr, - DstAddr: localIpv6Addr, + DstAddr: localIPv6Addr, }) // Create the ICMP header. @@ -643,8 +976,8 @@ func TestIPv6ReceiveControl(t *testing.T) { PayloadLength: 100, NextHeader: 10, HopLimit: 20, - SrcAddr: localIpv6Addr, - DstAddr: remoteIpv6Addr, + SrcAddr: localIPv6Addr, + DstAddr: remoteIPv6Addr, }) // Build the fragmentation header if needed. @@ -666,19 +999,19 @@ func TestIPv6ReceiveControl(t *testing.T) { // Give packet to IPv6 endpoint, dispatcher will validate that // it's ok. - o.protocol = 10 - o.srcAddr = remoteIpv6Addr - o.dstAddr = localIpv6Addr - o.contents = view[dataOffset:] - o.typ = c.expectedTyp - o.extra = c.expectedExtra + nic.testObject.protocol = 10 + nic.testObject.srcAddr = remoteIPv6Addr + nic.testObject.dstAddr = localIPv6Addr + nic.testObject.contents = view[dataOffset:] + nic.testObject.typ = c.expectedTyp + nic.testObject.extra = c.expectedExtra // Set ICMPv6 checksum. 
- icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{})) + icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIPv6Addr, buffer.VectorisedView{})) ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv6MinimumSize)) - if want := c.expectedCount; o.controlCalls != want { - t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want) + if want := c.expectedCount; nic.testObject.controlCalls != want { + t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, nic.testObject.controlCalls, want) } }) } @@ -696,3 +1029,406 @@ func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer _, _ = pkt.NetworkHeader().Consume(netHdrLen) return pkt } + +func TestWriteHeaderIncludedPacket(t *testing.T) { + const ( + nicID = 1 + transportProto = 5 + + dataLen = 4 + optionsLen = 4 + ) + + dataBuf := [dataLen]byte{1, 2, 3, 4} + data := dataBuf[:] + + ipv4OptionsBuf := [optionsLen]byte{0, 1, 0, 1} + ipv4Options := ipv4OptionsBuf[:] + + ipv6FragmentExtHdrBuf := [header.IPv6FragmentExtHdrLength]byte{transportProto, 0, 62, 4, 1, 2, 3, 4} + ipv6FragmentExtHdr := ipv6FragmentExtHdrBuf[:] + + var ipv6PayloadWithExtHdrBuf [dataLen + header.IPv6FragmentExtHdrLength]byte + ipv6PayloadWithExtHdr := ipv6PayloadWithExtHdrBuf[:] + if n := copy(ipv6PayloadWithExtHdr, ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr)) + } + if n := copy(ipv6PayloadWithExtHdr[header.IPv6FragmentExtHdrLength:], data); n != len(data) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(data)) + } + + tests := []struct { + name string + protoFactory stack.NetworkProtocolFactory + protoNum tcpip.NetworkProtocolNumber + nicAddr tcpip.Address + remoteAddr tcpip.Address + pktGen func(*testing.T, tcpip.Address) buffer.View + checker func(*testing.T, *stack.PacketBuffer, tcpip.Address) + expectedErr *tcpip.Error + }{ + { + name: "IPv4", + protoFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + nicAddr: localIPv4Addr, + remoteAddr: remoteIPv4Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + totalLen := header.IPv4MinimumSize + len(data) + hdr := buffer.NewPrependable(totalLen) + if n := copy(hdr.Prepend(len(data)), data); n != len(data) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(data)) + } + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + Protocol: transportProto, + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return hdr.View() + }, + checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) { + if src == header.IPv4Any { + src = localIPv4Addr + } + + netHdr := pkt.NetworkHeader() + + if len(netHdr.View()) != header.IPv4MinimumSize { + t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize) + } + + checker.IPv4(t, stack.PayloadSince(netHdr), + checker.SrcAddr(src), + checker.DstAddr(remoteIPv4Addr), + checker.IPv4HeaderLength(header.IPv4MinimumSize), + checker.IPFullLength(uint16(header.IPv4MinimumSize+len(data))), + checker.IPPayload(data), + ) + }, + }, + { + name: "IPv4 with IHL too small", + protoFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + nicAddr: localIPv4Addr, + remoteAddr: remoteIPv4Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + totalLen := header.IPv4MinimumSize + len(data) + hdr := 
buffer.NewPrependable(totalLen) + if n := copy(hdr.Prepend(len(data)), data); n != len(data) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(data)) + } + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize - 1, + Protocol: transportProto, + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return hdr.View() + }, + expectedErr: tcpip.ErrMalformedHeader, + }, + { + name: "IPv4 too small", + protoFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + nicAddr: localIPv4Addr, + remoteAddr: remoteIPv4Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + ip := header.IPv4(make([]byte, header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + Protocol: transportProto, + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return buffer.View(ip[:len(ip)-1]) + }, + expectedErr: tcpip.ErrMalformedHeader, + }, + { + name: "IPv4 minimum size", + protoFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + nicAddr: localIPv4Addr, + remoteAddr: remoteIPv4Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + ip := header.IPv4(make([]byte, header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + Protocol: transportProto, + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return buffer.View(ip) + }, + checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) { + if src == header.IPv4Any { + src = localIPv4Addr + } + + netHdr := pkt.NetworkHeader() + + if len(netHdr.View()) != header.IPv4MinimumSize { + t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize) + } + + checker.IPv4(t, stack.PayloadSince(netHdr), + checker.SrcAddr(src), + checker.DstAddr(remoteIPv4Addr), + checker.IPv4HeaderLength(header.IPv4MinimumSize), + checker.IPFullLength(header.IPv4MinimumSize), + checker.IPPayload(nil), + ) + }, + }, + { + name: "IPv4 with options", + protoFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + nicAddr: localIPv4Addr, + remoteAddr: remoteIPv4Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + ipHdrLen := header.IPv4MinimumSize + len(ipv4Options) + totalLen := ipHdrLen + len(data) + hdr := buffer.NewPrependable(totalLen) + if n := copy(hdr.Prepend(len(data)), data); n != len(data) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(data)) + } + ip := header.IPv4(hdr.Prepend(ipHdrLen)) + ip.Encode(&header.IPv4Fields{ + IHL: uint8(ipHdrLen), + Protocol: transportProto, + TTL: ipv4.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + if n := copy(ip.Options(), ipv4Options); n != len(ipv4Options) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv4Options)) + } + return hdr.View() + }, + checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) { + if src == header.IPv4Any { + src = localIPv4Addr + } + + netHdr := pkt.NetworkHeader() + + hdrLen := header.IPv4MinimumSize + len(ipv4Options) + if len(netHdr.View()) != hdrLen { + t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen) + } + + checker.IPv4(t, stack.PayloadSince(netHdr), + checker.SrcAddr(src), + checker.DstAddr(remoteIPv4Addr), + checker.IPv4HeaderLength(hdrLen), + checker.IPFullLength(uint16(hdrLen+len(data))), + checker.IPv4Options(ipv4Options), + checker.IPPayload(data), + ) + }, + }, + { + name: "IPv6", + protoFactory: ipv6.NewProtocol, + protoNum: 
ipv6.ProtocolNumber, + nicAddr: localIPv6Addr, + remoteAddr: remoteIPv6Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + totalLen := header.IPv6MinimumSize + len(data) + hdr := buffer.NewPrependable(totalLen) + if n := copy(hdr.Prepend(len(data)), data); n != len(data) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(data)) + } + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + NextHeader: transportProto, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return hdr.View() + }, + checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) { + if src == header.IPv6Any { + src = localIPv6Addr + } + + netHdr := pkt.NetworkHeader() + + if len(netHdr.View()) != header.IPv6MinimumSize { + t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize) + } + + checker.IPv6(t, stack.PayloadSince(netHdr), + checker.SrcAddr(src), + checker.DstAddr(remoteIPv6Addr), + checker.IPFullLength(uint16(header.IPv6MinimumSize+len(data))), + checker.IPPayload(data), + ) + }, + }, + { + name: "IPv6 with extension header", + protoFactory: ipv6.NewProtocol, + protoNum: ipv6.ProtocolNumber, + nicAddr: localIPv6Addr, + remoteAddr: remoteIPv6Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + totalLen := header.IPv6MinimumSize + len(ipv6FragmentExtHdr) + len(data) + hdr := buffer.NewPrependable(totalLen) + if n := copy(hdr.Prepend(len(data)), data); n != len(data) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(data)) + } + if n := copy(hdr.Prepend(len(ipv6FragmentExtHdr)), ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) { + t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr)) + } + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + NextHeader: uint8(header.IPv6FragmentExtHdrIdentifier), + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return hdr.View() + }, + checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) { + if src == header.IPv6Any { + src = localIPv6Addr + } + + netHdr := pkt.NetworkHeader() + + if want := header.IPv6MinimumSize + len(ipv6FragmentExtHdr); len(netHdr.View()) != want { + t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), want) + } + + checker.IPv6(t, stack.PayloadSince(netHdr), + checker.SrcAddr(src), + checker.DstAddr(remoteIPv6Addr), + checker.IPFullLength(uint16(header.IPv6MinimumSize+len(ipv6PayloadWithExtHdr))), + checker.IPPayload(ipv6PayloadWithExtHdr), + ) + }, + }, + { + name: "IPv6 minimum size", + protoFactory: ipv6.NewProtocol, + protoNum: ipv6.ProtocolNumber, + nicAddr: localIPv6Addr, + remoteAddr: remoteIPv6Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + ip := header.IPv6(make([]byte, header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + NextHeader: transportProto, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return buffer.View(ip) + }, + checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) { + if src == header.IPv6Any { + src = localIPv6Addr + } + + netHdr := pkt.NetworkHeader() + + if len(netHdr.View()) != header.IPv6MinimumSize { + t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize) + } + + checker.IPv6(t, stack.PayloadSince(netHdr), + checker.SrcAddr(src), + checker.DstAddr(remoteIPv6Addr), + checker.IPFullLength(header.IPv6MinimumSize), + 
checker.IPPayload(nil), + ) + }, + }, + { + name: "IPv6 too small", + protoFactory: ipv6.NewProtocol, + protoNum: ipv6.ProtocolNumber, + nicAddr: localIPv6Addr, + remoteAddr: remoteIPv6Addr, + pktGen: func(t *testing.T, src tcpip.Address) buffer.View { + ip := header.IPv6(make([]byte, header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + NextHeader: transportProto, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, + }) + return buffer.View(ip[:len(ip)-1]) + }, + expectedErr: tcpip.ErrMalformedHeader, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + subTests := []struct { + name string + srcAddr tcpip.Address + }{ + { + name: "unspecified source", + srcAddr: tcpip.Address(strings.Repeat("\x00", len(test.nicAddr))), + }, + { + name: "random source", + srcAddr: tcpip.Address(strings.Repeat("\xab", len(test.nicAddr))), + }, + } + + for _, subTest := range subTests { + t.Run(subTest.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{test.protoFactory}, + }) + e := channel.New(1, 1280, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + if err := s.AddAddress(nicID, test.protoNum, test.nicAddr); err != nil { + t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, test.protoNum, test.nicAddr, err) + } + + s.SetRouteTable([]tcpip.Route{{Destination: test.remoteAddr.WithPrefix().Subnet(), NIC: nicID}}) + + r, err := s.FindRoute(nicID, test.nicAddr, test.remoteAddr, test.protoNum, false /* multicastLoop */) + if err != nil { + t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, test.nicAddr, test.remoteAddr, test.protoNum, err) + } + defer r.Release() + + if err := r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: test.pktGen(t, subTest.srcAddr).ToVectorisedView(), + })); err != test.expectedErr { + t.Fatalf("got r.WriteHeaderIncludedPacket(_) = %s, want = %s", err, test.expectedErr) + } + + if test.expectedErr != nil { + return + } + + pkt, ok := e.Read() + if !ok { + t.Fatal("expected a packet to be written") + } + test.checker(t, pkt.Pkt, subTest.srcAddr) + }) + } + }) + } +} diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index d142b4ffa..7fc12e229 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -10,9 +10,11 @@ go_library( ], visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", @@ -26,11 +28,15 @@ go_test( deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/checker", "//pkg/tcpip/header", "//pkg/tcpip/link/channel", "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/arp", "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/testutil", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/waiter", diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index b5659a36b..3407755ed 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -15,6 +15,9 @@ package ipv4 import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -39,7 +42,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack // Drop packet if
it doesn't have the basic IPv4 header or if the // original source address doesn't match an address we own. src := hdr.SourceAddress() - if e.stack.CheckLocalAddress(e.NICID(), ProtocolNumber, src) == 0 { + if e.protocol.stack.CheckLocalAddress(e.nic.ID(), ProtocolNumber, src) == 0 { return } @@ -76,69 +79,87 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) { received.Echo.Increment() // Only send a reply if the checksum is valid. - wantChecksum := h.Checksum() - // Reset the checksum field to 0 to can calculate the proper - // checksum. We'll have to reset this before we hand the packet - // off. + headerChecksum := h.Checksum() h.SetChecksum(0) - gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */) - if gotChecksum != wantChecksum { - // It's possible that a raw socket expects to receive this. - h.SetChecksum(wantChecksum) + calculatedChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */) + h.SetChecksum(headerChecksum) + if calculatedChecksum != headerChecksum { + // It's possible that a raw socket still expects to receive this. e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt) received.Invalid.Increment() return } - // Make a copy of data before pkt gets sent to raw socket. - // DeliverTransportPacket will take ownership of pkt. - replyData := pkt.Data.Clone(nil) - replyData.TrimFront(header.ICMPv4MinimumSize) + // DeliverTransportPacket will take ownership of pkt so don't use it beyond + // this point. Make a deep copy of the data before pkt gets sent as we will + // be modifying fields. + // + // TODO(gvisor.dev/issue/4399): The copy may not be needed if there are no + // waiting endpoints. Consider moving responsibility for doing the copy to + // DeliverTransportPacket so that it is only done when needed. + replyData := pkt.Data.ToOwnedView() + replyIPHdr := header.IPv4(append(buffer.View(nil), pkt.NetworkHeader().View()...)) - // It's possible that a raw socket expects to receive this. - h.SetChecksum(wantChecksum) e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt) - remoteLinkAddr := r.RemoteLinkAddress - // As per RFC 1122 section 3.2.1.3, when a host sends any datagram, the IP // source address MUST be one of its own IP addresses (but not a broadcast // or multicast address). localAddr := r.LocalAddress - if r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) { + if r.IsInboundBroadcast() || header.IsV4MulticastAddress(localAddr) { localAddr = "" } - r, err := r.Stack().FindRoute(e.NICID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) + r, err := r.Stack().FindRoute(e.nic.ID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet. return } defer r.Release() - // Use the remote link address from the incoming packet. - r.ResolveWith(remoteLinkAddr) - - // Prepare a reply packet. - icmpHdr := make(header.ICMPv4, header.ICMPv4MinimumSize) - copy(icmpHdr, h) - icmpHdr.SetType(header.ICMPv4EchoReply) - icmpHdr.SetChecksum(0) - icmpHdr.SetChecksum(^header.Checksum(icmpHdr, header.ChecksumVV(replyData, 0))) - dataVV := buffer.View(icmpHdr).ToVectorisedView() - dataVV.Append(replyData) + // TODO(gvisor.dev/issue/3810): When adding protocol numbers into the + // header information, we may have to change this code to handle the + // ICMP header no longer being in the data buffer.
+ + // Because IP and ICMP are so closely intertwined, we need to handcraft our + // IP header to be able to follow RFC 792. The wording on page 13 is as + // follows: + // IP Fields: + // Addresses + // The address of the source in an echo message will be the + // destination of the echo reply message. To form an echo reply + // message, the source and destination addresses are simply reversed, + // the type code changed to 0, and the checksum recomputed. + // + // This was interpreted by early implementors to mean that all options must + // be copied from the echo request IP header to the echo reply IP header + // and this behaviour is still relied upon by some applications. + // + // Create a copy of the IP header we received, options and all, and change + // the fields we need to alter. + // + // We need to produce the entire packet in the data segment in order to + // use WriteHeaderIncludedPacket(). + replyIPHdr.SetSourceAddress(r.LocalAddress) + replyIPHdr.SetDestinationAddress(r.RemoteAddress) + replyIPHdr.SetTTL(r.DefaultTTL()) + + replyICMPHdr := header.ICMPv4(replyData) + replyICMPHdr.SetType(header.ICMPv4EchoReply) + replyICMPHdr.SetChecksum(0) + replyICMPHdr.SetChecksum(^header.Checksum(replyData, 0)) + + replyVV := buffer.View(replyIPHdr).ToVectorisedView() + replyVV.AppendView(replyData) replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), - Data: dataVV, + Data: replyVV, }) + replyPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber - // Send out the reply packet. + // The checksum will be calculated so we don't need to do it here. sent := stats.ICMP.V4PacketsSent - if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{ - Protocol: header.ICMPv4ProtocolNumber, - TTL: r.DefaultTTL(), - TOS: stack.DefaultTOS, - }, replyPkt); err != nil { + if err := r.WriteHeaderIncludedPacket(replyPkt); err != nil { sent.Dropped.Increment() return } @@ -193,3 +214,190 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) { received.Invalid.Increment() } } + +// ======= ICMP Error packet generation ========= + +// icmpReason is a marker interface for IPv4 specific ICMP errors. +type icmpReason interface { + isICMPReason() +} + +// icmpReasonPortUnreachable is an error where the transport protocol has no +// listener and no alternative means to inform the sender. +type icmpReasonPortUnreachable struct{} + +func (*icmpReasonPortUnreachable) isICMPReason() {} + +// icmpReasonProtoUnreachable is an error where the transport protocol is +// not supported. +type icmpReasonProtoUnreachable struct{} + +func (*icmpReasonProtoUnreachable) isICMPReason() {} + +// returnError takes an error descriptor and generates the appropriate ICMP +// error packet for IPv4 and sends it back to the remote device that sent +// the problematic packet. It incorporates as much of that packet as +// possible as well as any error metadata as is available. returnError +// expects pkt to hold a valid IPv4 packet as per the wire format. +func (p *protocol) returnError(r *stack.Route, reason icmpReason, pkt *stack.PacketBuffer) *tcpip.Error { + // We check we are responding only when we are allowed to. + // See RFC 1812 section 4.3.2.7 (shown below).
+ // + // ========= + // 4.3.2.7 When Not to Send ICMP Errors + // + // An ICMP error message MUST NOT be sent as the result of receiving: + // + // o An ICMP error message, or + // + // o A packet which fails the IP header validation tests described in + // Section [5.2.2] (except where that section specifically permits + // the sending of an ICMP error message), or + // + // o A packet destined to an IP broadcast or IP multicast address, or + // + // o A packet sent as a Link Layer broadcast or multicast, or + // + // o Any fragment of a datagram other than the first fragment (i.e., a + // packet for which the fragment offset in the IP header is nonzero). + // + // TODO(gvisor.dev/issues/4058): Make sure we don't send ICMP errors in + // response to a non-initial fragment, but it currently cannot happen. + + if r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) || r.RemoteAddress == header.IPv4Any { + return nil + } + + // Even if we were able to receive a packet from some remote, we may not have + // a route to it - the remote may be blocked via routing rules. We must always + // consult our routing table and find a route to the remote before sending any + // packet. + route, err := p.stack.FindRoute(r.NICID(), r.LocalAddress, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) + if err != nil { + return err + } + defer route.Release() + // From this point on, the incoming route should no longer be used; route + // must be used to send the ICMP error. + r = nil + + sent := p.stack.Stats().ICMP.V4PacketsSent + if !p.stack.AllowICMPMessage() { + sent.RateLimited.Increment() + return nil + } + + networkHeader := pkt.NetworkHeader().View() + transportHeader := pkt.TransportHeader().View() + + // Don't respond to ICMP error packets. + if header.IPv4(networkHeader).Protocol() == uint8(header.ICMPv4ProtocolNumber) { + // TODO(gvisor.dev/issue/3810): + // Unfortunately the current stack pretty much always has ICMPv4 headers + // in the Data section of the packet but there is no guarantee that is the + // case. If this is the case grab the header to make it like all other + // packet types. When this is cleaned up the Consume should be removed. + if transportHeader.IsEmpty() { + var ok bool + transportHeader, ok = pkt.TransportHeader().Consume(header.ICMPv4MinimumSize) + if !ok { + return nil + } + } else if transportHeader.Size() < header.ICMPv4MinimumSize { + return nil + } + // We need to decide to explicitly name the packets we can respond to or + // the ones we cannot respond to. The decision is somewhat arbitrary and + // if problems arise this could be reversed. It was judged less of a breach + // of protocol to not respond to unknown non-error packets than to respond + // to unknown error packets so we take the first approach. + switch header.ICMPv4(transportHeader).Type() { + case + header.ICMPv4EchoReply, + header.ICMPv4Echo, + header.ICMPv4Timestamp, + header.ICMPv4TimestampReply, + header.ICMPv4InfoRequest, + header.ICMPv4InfoReply: + default: + // Assume any type we don't know about may be an error type. + return nil + } + } + + // Now work out how much of the triggering packet we should return. + // As per RFC 1812 Section 4.3.2.3 + // + // ICMP datagram SHOULD contain as much of the original + // datagram as possible without the length of the ICMP + // datagram exceeding 576 bytes.
+ // + // NOTE: The RFC referenced above differs from the original + // recommendation in RFC 1122 and RFC 792, which mentioned that at + // least 8 bytes of the payload must be included. Today Linux and other + // systems implement the RFC 1812 definition and not the original + // requirement. We treat 8 bytes as the minimum but will try to send more. + mtu := int(route.MTU()) + if mtu > header.IPv4MinimumProcessableDatagramSize { + mtu = header.IPv4MinimumProcessableDatagramSize + } + headerLen := int(route.MaxHeaderLength()) + header.ICMPv4MinimumSize + available := int(mtu) - headerLen + + if available < header.IPv4MinimumSize+header.ICMPv4MinimumErrorPayloadSize { + return nil + } + + payloadLen := networkHeader.Size() + transportHeader.Size() + pkt.Data.Size() + if payloadLen > available { + payloadLen = available + } + + // The buffers used by pkt may be used elsewhere in the system. + // For example, an AF_RAW or AF_PACKET socket may use what the transport + // protocol considers an unreachable destination. Thus we deep copy pkt to + // prevent multiple ownership and SR errors. The new copy is a vectorized + // view with the entire incoming IP packet reassembled and truncated as + // required. This is now the payload of the new ICMP packet and no longer + // considered a packet in its own right. + newHeader := append(buffer.View(nil), networkHeader...) + newHeader = append(newHeader, transportHeader...) + payload := newHeader.ToVectorisedView() + payload.AppendView(pkt.Data.ToView()) + payload.CapLength(payloadLen) + + icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: headerLen, + Data: payload, + }) + + icmpPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber + + icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize)) + switch reason.(type) { + case *icmpReasonPortUnreachable: + icmpHdr.SetCode(header.ICMPv4PortUnreachable) + case *icmpReasonProtoUnreachable: + icmpHdr.SetCode(header.ICMPv4ProtoUnreachable) + default: + panic(fmt.Sprintf("unsupported ICMP type %T", reason)) + } + icmpHdr.SetType(header.ICMPv4DstUnreachable) + icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data)) + counter := sent.DstUnreachable + + if err := route.WritePacket( + nil, /* gso */ + stack.NetworkHeaderParams{ + Protocol: header.ICMPv4ProtocolNumber, + TTL: route.DefaultTTL(), + TOS: stack.DefaultTOS, + }, + icmpPkt, + ); err != nil { + sent.Dropped.Increment() + return err + } + counter.Increment() + return nil +} diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 79872ec9a..e7c58ae0a 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -12,26 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package ipv4 contains the implementation of the ipv4 network protocol. To use -// it in the networking stack, this package must be added to the project, and -// activated on the stack by passing ipv4.NewProtocol() as one of the network -// protocols when calling stack.New(). Then endpoints can be created by passing -// ipv4.ProtocolNumber as the network protocol number when calling -// Stack.NewEndpoint(). +// Package ipv4 contains the implementation of the ipv4 network protocol.
package ipv4

import (
+ "fmt"
"sync/atomic"
+ "time"

+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
"gvisor.dev/gvisor/pkg/tcpip/network/hash"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
+ // As per RFC 791 section 3.2:
+ // The current recommendation for the initial timer setting is 15 seconds.
+ // This may be changed as experience with this protocol accumulates.
+ //
+ // Considering that it is an old recommendation, we use the same reassembly
+ // timeout that Linux defines, which is 30 seconds:
+ // https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
+ reassembleTimeout = 30 * time.Second
+
// ProtocolNumber is the ipv4 protocol number.
ProtocolNumber = header.IPv4ProtocolNumber

@@ -50,158 +58,136 @@ const (
fragmentblockSize = 8
)

+var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()
+
+var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
+var _ stack.AddressableEndpoint = (*endpoint)(nil)
+var _ stack.NetworkEndpoint = (*endpoint)(nil)
+
type endpoint struct {
- nicID tcpip.NICID
- linkEP stack.LinkEndpoint
+ nic stack.NetworkInterface
dispatcher stack.TransportDispatcher
protocol *protocol
- stack *stack.Stack
+
+ // enabled is set to 1 when the endpoint is enabled and 0 when it is
+ // disabled.
+ //
+ // Must be accessed using atomic operations.
+ enabled uint32
+
+ mu struct {
+ sync.RWMutex
+
+ addressableEndpointState stack.AddressableEndpointState
+ }
}

// NewEndpoint creates a new ipv4 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
- return &endpoint{
- nicID: nicID,
- linkEP: linkEP,
+func (p *protocol) NewEndpoint(nic stack.NetworkInterface, _ stack.LinkAddressCache, _ stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
+ e := &endpoint{
+ nic: nic,
dispatcher: dispatcher,
protocol: p,
- stack: st,
}
+ e.mu.addressableEndpointState.Init(e)
+ return e
}

-// DefaultTTL is the default time-to-live value for this endpoint.
-func (e *endpoint) DefaultTTL() uint8 {
- return e.protocol.DefaultTTL()
-}
+// Enable implements stack.NetworkEndpoint.
+func (e *endpoint) Enable() *tcpip.Error {
+ e.mu.Lock()
+ defer e.mu.Unlock()

-// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
-// the network layer max header length.
-func (e *endpoint) MTU() uint32 {
- return calculateMTU(e.linkEP.MTU())
-}
+ // If the NIC is not enabled, the endpoint can't do anything meaningful, so
+ // don't enable the endpoint.
+ if !e.nic.Enabled() {
+ return tcpip.ErrNotPermitted
+ }
+
+ // If the endpoint is already enabled, there is nothing for it to do.
+ if !e.setEnabled(true) {
+ return nil
+ }
+
+ // Create an endpoint to receive broadcast packets on this interface.
+ ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */)
+ if err != nil {
+ return err
+ }
+ // We have no need for the address endpoint.
+ ep.DecRef()

-// Capabilities implements stack.NetworkEndpoint.Capabilities.
-func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.linkEP.Capabilities() + // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts + // multicast group. Note, the IANA calls the all-hosts multicast group the + // all-systems multicast group. + _, err = e.mu.addressableEndpointState.JoinGroup(header.IPv4AllSystems) + return err } -// NICID returns the ID of the NIC this endpoint belongs to. -func (e *endpoint) NICID() tcpip.NICID { - return e.nicID +// Enabled implements stack.NetworkEndpoint. +func (e *endpoint) Enabled() bool { + return e.nic.Enabled() && e.isEnabled() } -// MaxHeaderLength returns the maximum length needed by ipv4 headers (and -// underlying protocols). -func (e *endpoint) MaxHeaderLength() uint16 { - return e.linkEP.MaxHeaderLength() + header.IPv4MinimumSize +// isEnabled returns true if the endpoint is enabled, regardless of the +// enabled status of the NIC. +func (e *endpoint) isEnabled() bool { + return atomic.LoadUint32(&e.enabled) == 1 } -// GSOMaxSize returns the maximum GSO packet size. -func (e *endpoint) GSOMaxSize() uint32 { - if gso, ok := e.linkEP.(stack.GSOEndpoint); ok { - return gso.GSOMaxSize() +// setEnabled sets the enabled status for the endpoint. +// +// Returns true if the enabled status was updated. +func (e *endpoint) setEnabled(v bool) bool { + if v { + return atomic.SwapUint32(&e.enabled, 1) == 0 } - return 0 + return atomic.SwapUint32(&e.enabled, 0) == 1 } -// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber. -func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { - return e.protocol.Number() +// Disable implements stack.NetworkEndpoint. +func (e *endpoint) Disable() { + e.mu.Lock() + defer e.mu.Unlock() + e.disableLocked() } -// writePacketFragments calls e.linkEP.WritePacket with each packet fragment to -// write. It assumes that the IP header is already present in pkt.NetworkHeader. -// pkt.TransportHeader may be set. mtu includes the IP header and options. This -// does not support the DontFragment IP flag. -func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt *stack.PacketBuffer) *tcpip.Error { - // This packet is too big, it needs to be fragmented. - ip := header.IPv4(pkt.NetworkHeader().View()) - flags := ip.Flags() - - // Update mtu to take into account the header, which will exist in all - // fragments anyway. - innerMTU := mtu - int(ip.HeaderLength()) - - // Round the MTU down to align to 8 bytes. Then calculate the number of - // fragments. Calculate fragment sizes as in RFC791. - innerMTU &^= 7 - n := (int(ip.PayloadLength()) + innerMTU - 1) / innerMTU - - outerMTU := innerMTU + int(ip.HeaderLength()) - offset := ip.FragmentOffset() - - // Keep the length reserved for link-layer, we need to create fragments with - // the same reserved length. - reservedForLink := pkt.AvailableHeaderBytes() - - // Destroy the packet, pull all payloads out for fragmentation. - transHeader, data := pkt.TransportHeader().View(), pkt.Data - - // Where possible, the first fragment that is sent has the same - // number of bytes reserved for header as the input packet. The link-layer - // endpoint may depend on this for looking at, eg, L4 headers. - transFitsFirst := len(transHeader) <= innerMTU - - for i := 0; i < n; i++ { - reserve := reservedForLink + int(ip.HeaderLength()) - if i == 0 && transFitsFirst { - // Reserve for transport header if it's going to be put in the first - // fragment. 
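The enabled/setEnabled/isEnabled trio above is a small reusable pattern: atomic.SwapUint32 returns the previous value, which tells the caller whether the call actually changed state. Extracted as a self-contained sketch (the type name is invented for illustration):

package main

import (
	"fmt"
	"sync/atomic"
)

type enabledFlag struct {
	v uint32 // 1 when enabled, 0 when disabled; atomic access only.
}

func (f *enabledFlag) isEnabled() bool {
	return atomic.LoadUint32(&f.v) == 1
}

// setEnabled returns true if the enabled status was updated.
func (f *enabledFlag) setEnabled(enable bool) bool {
	if enable {
		return atomic.SwapUint32(&f.v, 1) == 0
	}
	return atomic.SwapUint32(&f.v, 0) == 1
}

func main() {
	var f enabledFlag
	fmt.Println(f.setEnabled(true))  // true: 0 -> 1
	fmt.Println(f.setEnabled(true))  // false: already enabled
	fmt.Println(f.isEnabled())       // true
	fmt.Println(f.setEnabled(false)) // true: 1 -> 0
}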
- reserve += len(transHeader) - } - fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: reserve, - }) - fragPkt.NetworkProtocolNumber = header.IPv4ProtocolNumber +func (e *endpoint) disableLocked() { + if !e.setEnabled(false) { + return + } - // Copy data for the fragment. - avail := innerMTU + // The endpoint may have already left the multicast group. + if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv4AllSystems); err != nil && err != tcpip.ErrBadLocalAddress { + panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err)) + } - if n := len(transHeader); n > 0 { - if n > avail { - n = avail - } - if i == 0 && transFitsFirst { - copy(fragPkt.TransportHeader().Push(n), transHeader) - } else { - fragPkt.Data.AppendView(transHeader[:n:n]) - } - transHeader = transHeader[n:] - avail -= n - } + // The address may have already been removed. + if err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err != nil && err != tcpip.ErrBadLocalAddress { + panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err)) + } +} - if avail > 0 { - n := data.Size() - if n > avail { - n = avail - } - data.ReadToVV(&fragPkt.Data, n) - avail -= n - } +// DefaultTTL is the default time-to-live value for this endpoint. +func (e *endpoint) DefaultTTL() uint8 { + return e.protocol.DefaultTTL() +} - copied := uint16(innerMTU - avail) - - // Set lengths in header and calculate checksum. - h := header.IPv4(fragPkt.NetworkHeader().Push(len(ip))) - copy(h, ip) - if i != n-1 { - h.SetTotalLength(uint16(outerMTU)) - h.SetFlagsFragmentOffset(flags|header.IPv4FlagMoreFragments, offset) - } else { - h.SetTotalLength(uint16(h.HeaderLength()) + copied) - h.SetFlagsFragmentOffset(flags, offset) - } - h.SetChecksum(0) - h.SetChecksum(^h.CalculateChecksum()) - offset += copied +// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus +// the network layer max header length. +func (e *endpoint) MTU() uint32 { + return calculateMTU(e.nic.MTU()) +} - // Send out the fragment. - if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, fragPkt); err != nil { - return err - } - r.Stats().IP.PacketsSent.Increment() - } - return nil +// MaxHeaderLength returns the maximum length needed by ipv4 headers (and +// underlying protocols). +func (e *endpoint) MaxHeaderLength() uint16 { + return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize +} + +// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber. +func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { + return e.protocol.Number() } func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) { @@ -222,30 +208,62 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s DstAddr: r.RemoteAddress, }) ip.SetChecksum(^ip.CalculateChecksum()) - pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + pkt.NetworkProtocolNumber = ProtocolNumber +} + +func (e *endpoint) packetMustBeFragmented(pkt *stack.PacketBuffer, gso *stack.GSO) bool { + return (gso == nil || gso.Type == stack.GSONone) && pkt.Size() > int(e.nic.MTU()) +} + +// handleFragments fragments pkt and calls the handler function on each +// fragment. It returns the number of fragments handled and the number of +// fragments left to be processed. The IP header must already be present in the +// original packet. 
The mtu is the maximum size of the packets. +func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, mtu uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) *tcpip.Error) (int, int, *tcpip.Error) { + fragMTU := int(calculateFragmentInnerMTU(mtu, pkt)) + networkHeader := header.IPv4(pkt.NetworkHeader().View()) + pf := fragmentation.MakePacketFragmenter(pkt, fragMTU, pkt.AvailableHeaderBytes()+len(networkHeader)) + + var n int + for { + fragPkt, more := buildNextFragment(&pf, networkHeader) + if err := handler(fragPkt); err != nil { + return n, pf.RemainingFragmentCount() + 1, err + } + n++ + if !more { + return n, pf.RemainingFragmentCount(), nil + } + } } // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { e.addIPHeader(r, pkt, params) + return e.writePacket(r, gso, pkt) +} +func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer) *tcpip.Error { // iptables filtering. All packets that reach here are locally // generated. - nicName := e.stack.FindNICNameFromID(e.NICID()) - ipt := e.stack.IPTables() + nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) + ipt := e.protocol.stack.IPTables() if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok { // iptables is telling us to drop the packet. + r.Stats().IP.IPTablesOutputDropped.Increment() return nil } - // If the packet is manipulated as per NAT Ouput rules, handle packet - // based on destination address and do not send the packet to link layer. - // TODO(gvisor.dev/issue/170): We should do this for every packet, rather than - // only NATted packets, but removing this check short circuits broadcasts - // before they are sent out to other hosts. + // If the packet is manipulated as per NAT Output rules, handle packet + // based on destination address and do not send the packet to link + // layer. + // + // TODO(gvisor.dev/issue/170): We should do this for every + // packet, rather than only NATted packets, but removing this check + // short circuits broadcasts before they are sent out to other hosts. if pkt.NatDone { netHeader := header.IPv4(pkt.NetworkHeader().View()) - ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()) + ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()) if err == nil { route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress()) ep.HandlePacket(&route, pkt) @@ -261,10 +279,21 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw if r.Loop&stack.PacketOut == 0 { return nil } - if pkt.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) { - return e.writePacketFragments(r, gso, int(e.linkEP.MTU()), pkt) + + if e.packetMustBeFragmented(pkt, gso) { + sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, func(fragPkt *stack.PacketBuffer) *tcpip.Error { + // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each + // fragment one by one using WritePacket() (current strategy) or if we + // want to create a PacketBufferList from the fragments and feed it to + // WritePackets(). It'll be faster but cost more memory. 
+ return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
+ })
+ r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+ r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(remain))
+ return err
}
- if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+ if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+ r.Stats().IP.OutgoingPacketErrors.Increment()
return err
}
r.Stats().IP.PacketsSent.Increment()
@@ -280,25 +309,43 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return pkts.Len(), nil
}

- for pkt := pkts.Front(); pkt != nil; {
+ for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
e.addIPHeader(r, pkt, params)
- pkt = pkt.Next()
+ if e.packetMustBeFragmented(pkt, gso) {
+ // Keep track of the packet that is about to be fragmented so it can be
+ // removed once the fragmentation is done.
+ originalPkt := pkt
+ if _, _, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+ // Modify the packet list in place with the new fragments.
+ pkts.InsertAfter(pkt, fragPkt)
+ pkt = fragPkt
+ return nil
+ }); err != nil {
+ panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", e.nic.MTU(), err))
+ }
+ // Remove the packet that was just fragmented and process the rest.
+ pkts.Remove(originalPkt)
+ }
}

- nicName := e.stack.FindNICNameFromID(e.NICID())
+ nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
// iptables filtering. All packets that reach here are locally
// generated.
- ipt := e.stack.IPTables()
+ ipt := e.protocol.stack.IPTables()
dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
if len(dropped) == 0 && len(natPkts) == 0 {
// Fast path: If no packets are to be dropped then we can just invoke the
// faster WritePackets API directly.
- n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+ n, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber)
r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+ if err != nil {
+ r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n))
+ }
return n, err
}
+ r.Stats().IP.IPTablesOutputDropped.IncrementBy(uint64(len(dropped)))

- // Slow Path as we are dropping some packets in the batch degrade to
+ // Slow path: since we are dropping some packets in the batch, degrade to
// emitting one packet at a time.
n := 0
for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
@@ -307,7 +354,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
}
if _, ok := natPkts[pkt]; ok {
netHeader := header.IPv4(pkt.NetworkHeader().View())
- if ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+ if ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()); err == nil {
src := netHeader.SourceAddress()
dst := netHeader.DestinationAddress()
route := r.ReverseRoute(src, dst)
@@ -316,40 +363,41 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
continue
}
}
- if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+ if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
- return n, err
+ r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n - len(dropped)))
+ // Dropped packets aren't errors, so include them in
+ // the return value.
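The handleFragments contract used by both WritePacket and WritePackets above (hand each fragment to a callback, and report how many fragments were handled plus how many were left when an error interrupted the loop) can be sketched independently of the stack types. The fragmenter below is a stand-in, not the fragmentation.PacketFragmenter API:

package main

import (
	"errors"
	"fmt"
)

type fragmenter struct {
	remaining []string // stand-in for pending fragments
}

func (f *fragmenter) next() (frag string, more bool) {
	frag, f.remaining = f.remaining[0], f.remaining[1:]
	return frag, len(f.remaining) > 0
}

// forEachFragment mirrors the handleFragments return shape: fragments sent,
// fragments remaining (including the one that failed), and the error.
func forEachFragment(f *fragmenter, handler func(string) error) (sent, remain int, err error) {
	for {
		frag, more := f.next()
		if err := handler(frag); err != nil {
			return sent, len(f.remaining) + 1, err
		}
		sent++
		if !more {
			return sent, 0, nil
		}
	}
}

func main() {
	f := &fragmenter{remaining: []string{"frag0", "frag1", "frag2"}}
	sent, remain, err := forEachFragment(f, func(frag string) error {
		if frag == "frag2" {
			return errors.New("link is full") // fail on the last fragment
		}
		return nil
	})
	fmt.Println(sent, remain, err) // 2 1 link is full
}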
+ return n + len(dropped), err
}
n++
}
r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
- return n, nil
+ // Dropped packets aren't errors, so include them in the return value.
+ return n + len(dropped), nil
}

-// WriteHeaderIncludedPacket writes a packet already containing a network
-// header through the given route.
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
// The packet already has an IP header, but there are a few required
// checks.
h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
if !ok {
- return tcpip.ErrInvalidOptionValue
+ return tcpip.ErrMalformedHeader
}
ip := header.IPv4(h)
- if !ip.IsValid(pkt.Data.Size()) {
- return tcpip.ErrInvalidOptionValue
- }

// Always set the total length.
- ip.SetTotalLength(uint16(pkt.Data.Size()))
+ pktSize := pkt.Data.Size()
+ ip.SetTotalLength(uint16(pktSize))

// Set the source address when zero.
- if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) {
+ if ip.SourceAddress() == header.IPv4Any {
ip.SetSourceAddress(r.LocalAddress)
}

- // Set the destination. If the packet already included a destination,
- // it will be part of the route.
+ // Set the destination. If the packet already included a destination, it will
+ // be part of the route anyway.
ip.SetDestinationAddress(r.RemoteAddress)

// Set the packet ID when zero.
@@ -366,32 +414,73 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
ip.SetChecksum(0)
ip.SetChecksum(^ip.CalculateChecksum())

- if r.Loop&stack.PacketLoop != 0 {
- e.HandlePacket(r, pkt.Clone())
- }
- if r.Loop&stack.PacketOut == 0 {
- return nil
+ // Populate the packet buffer's network header and don't allow an invalid
+ // packet to be sent.
+ //
+ // Note that parsing only makes sure that the packet is well formed as per the
+ // wire format. We also want to check if the header's fields are valid before
+ // sending the packet.
+ if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
+ return tcpip.ErrMalformedHeader
}

- r.Stats().IP.PacketsSent.Increment()
-
- return e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
+ return e.writePacket(r, nil /* gso */, pkt)
}

// HandlePacket is called by the link layer when new ipv4 packets arrive for
// this endpoint.
func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+ if !e.isEnabled() {
+ return
+ }
+
h := header.IPv4(pkt.NetworkHeader().View())
if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) {
r.Stats().IP.MalformedPacketsReceived.Increment()
return
}

+ // There has been some confusion regarding verifying checksums. We need
+ // only look for negative 0 (0xffff) as the checksum, as it's not possible to
+ // get positive 0 (0) for the checksum. Some bad implementations could get it
+ // when doing entry replacement in the early days of the Internet;
+ // however, the lore that one needs to check for both persists.
+ //
+ // RFC 1624 section 1 describes the source of this confusion as:
+ // [the partial recalculation method described in RFC 1071] computes a
+ // result for certain cases that differs from the one obtained from
+ // scratch (one's complement of one's complement sum of the original
+ // fields).
+ //
+ // However RFC 1624 section 5 clarifies that if using the verification method
+ // "recommended by RFC 1071, it does not matter if an intermediate system
+ // generated a -0 instead of +0".
+ // + // RFC1071 page 1 specifies the verification method as: + // (3) To check a checksum, the 1's complement sum is computed over the + // same set of octets, including the checksum field. If the result + // is all 1 bits (-0 in 1's complement arithmetic), the check + // succeeds. + if h.CalculateChecksum() != 0xffff { + r.Stats().IP.MalformedPacketsReceived.Increment() + return + } + + // As per RFC 1122 section 3.2.1.3: + // When a host sends any datagram, the IP source address MUST + // be one of its own IP addresses (but not a broadcast or + // multicast address). + if r.IsOutboundBroadcast() || header.IsV4MulticastAddress(r.RemoteAddress) { + r.Stats().IP.InvalidSourceAddressesReceived.Increment() + return + } + // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. - ipt := e.stack.IPTables() + ipt := e.protocol.stack.IPTables() if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok { // iptables is telling us to drop the packet. + r.Stats().IP.IPTablesInputDropped.Increment() return } @@ -404,29 +493,35 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { return } // The packet is a fragment, let's try to reassemble it. - last := h.FragmentOffset() + uint16(pkt.Data.Size()) - 1 - // Drop the packet if the fragmentOffset is incorrect. i.e the - // combination of fragmentOffset and pkt.Data.size() causes a - // wrap around resulting in last being less than the offset. - if last < h.FragmentOffset() { + start := h.FragmentOffset() + // Drop the fragment if the size of the reassembled payload would exceed the + // maximum payload size. + // + // Note that this addition doesn't overflow even on 32bit architecture + // because pkt.Data.Size() should not exceed 65535 (the max IP datagram + // size). Otherwise the packet would've been rejected as invalid before + // reaching here. + if int(start)+pkt.Data.Size() > header.IPv4MaximumPayloadSize { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() return } var ready bool var err error - pkt.Data, ready, err = e.protocol.fragmentation.Process( + proto := h.Protocol() + pkt.Data, _, ready, err = e.protocol.fragmentation.Process( // As per RFC 791 section 2.3, the identification value is unique // for a source-destination pair and protocol. fragmentation.FragmentID{ Source: h.SourceAddress(), Destination: h.DestinationAddress(), ID: uint32(h.ID()), - Protocol: h.Protocol(), + Protocol: proto, }, - h.FragmentOffset(), - last, + start, + start+uint16(pkt.Data.Size())-1, h.More(), + proto, pkt.Data, ) if err != nil { @@ -438,27 +533,165 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { return } } + + r.Stats().IP.PacketsDelivered.Increment() p := h.TransportProtocol() if p == header.ICMPv4ProtocolNumber { + // TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport + // headers, the setting of the transport number here should be + // unnecessary and removed. 
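The RFC 1071 verification quoted above is easy to demonstrate end to end: insert the one's complement checksum into a header, then sum the whole header again, checksum included, and expect 0xffff. A runnable sketch; the checksum helper is written out longhand here for illustration, whereas the stack uses header.Checksum and CalculateChecksum:

package main

import "fmt"

// onesComplementSum folds the 16-bit one's complement sum of b.
func onesComplementSum(b []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(b); i += 2 {
		sum += uint32(b[i])<<8 | uint32(b[i+1])
	}
	if len(b)%2 == 1 {
		sum += uint32(b[len(b)-1]) << 8
	}
	for sum > 0xffff {
		sum = (sum >> 16) + (sum & 0xffff) // fold the carries back in
	}
	return uint16(sum)
}

func main() {
	// Minimal 20-byte IPv4 header with a zeroed checksum field (bytes 10-11).
	hdr := []byte{
		0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x01, 0x00, 0x00,
		0xc0, 0xa8, 0x01, 0x3a, // 192.168.1.58
		0x0a, 0x00, 0x00, 0x01, // 10.0.0.1
	}
	csum := ^onesComplementSum(hdr) // the checksum a sender would transmit
	hdr[10], hdr[11] = byte(csum>>8), byte(csum)
	// Verification: the sum over the header including the checksum is -0.
	fmt.Printf("%#04x\n", onesComplementSum(hdr)) // 0xffff
}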
+ pkt.TransportProtocolNumber = p
e.handleICMP(r, pkt)
return
}
- r.Stats().IP.PacketsDelivered.Increment()
- e.dispatcher.DeliverTransportPacket(r, p, pkt)
+
+ switch res := e.dispatcher.DeliverTransportPacket(r, p, pkt); res {
+ case stack.TransportPacketHandled:
+ case stack.TransportPacketDestinationPortUnreachable:
+ // As per RFC 1122 Section 3.2.2.1 a host SHOULD generate Destination
+ // Unreachable messages with code:
+ // 3 (Port Unreachable), when the designated transport protocol
+ // (e.g., UDP) is unable to demultiplex the datagram but has no
+ // protocol mechanism to inform the sender.
+ _ = e.protocol.returnError(r, &icmpReasonPortUnreachable{}, pkt)
+ case stack.TransportPacketProtocolUnreachable:
+ // As per RFC 1122 Section 3.2.2.1
+ // A host SHOULD generate Destination Unreachable messages with code:
+ // 2 (Protocol Unreachable), when the designated transport protocol
+ // is not supported.
+ _ = e.protocol.returnError(r, &icmpReasonProtoUnreachable{}, pkt)
+ default:
+ panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
+ }
}

// Close cleans up resources associated with the endpoint.
-func (e *endpoint) Close() {}
+func (e *endpoint) Close() {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ e.disableLocked()
+ e.mu.addressableEndpointState.Cleanup()
+}
+
+// AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated)
+}
+
+// RemovePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return e.mu.addressableEndpointState.RemovePermanentAddress(addr)
+}
+
+// MainAddress implements stack.AddressableEndpoint.
+func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
+ e.mu.RLock()
+ defer e.mu.RUnlock()
+ return e.mu.addressableEndpointState.MainAddress()
+}
+
+// AcquireAssignedAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ loopback := e.nic.IsLoopback()
+ addressEndpoint := e.mu.addressableEndpointState.ReadOnly().AddrOrMatching(localAddr, allowTemp, func(addressEndpoint stack.AddressEndpoint) bool {
+ subnet := addressEndpoint.AddressWithPrefix().Subnet()
+ // IPv4 has a notion of a subnet broadcast address and considers the
+ // loopback interface bound to an address's whole subnet (on Linux).
+ return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr))
+ })
+ if addressEndpoint != nil {
+ return addressEndpoint
+ }
+
+ if !allowTemp {
+ return nil
+ }
+
+ addr := localAddr.WithPrefix()
+ addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(addr, tempPEB)
+ if err != nil {
+ // AddAddress only returns an error if the address is already assigned,
+ // but we just checked above if the address exists so we expect no error.
+ panic(fmt.Sprintf("e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(%s, %d): %s", addr, tempPEB, err))
+ }
+ return addressEndpoint
+}
+
+// AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { + e.mu.RLock() + defer e.mu.RUnlock() + return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired) +} + +// PrimaryAddresses implements stack.AddressableEndpoint. +func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { + e.mu.RLock() + defer e.mu.RUnlock() + return e.mu.addressableEndpointState.PrimaryAddresses() +} + +// PermanentAddresses implements stack.AddressableEndpoint. +func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { + e.mu.RLock() + defer e.mu.RUnlock() + return e.mu.addressableEndpointState.PermanentAddresses() +} + +// JoinGroup implements stack.GroupAddressableEndpoint. +func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) { + if !header.IsV4MulticastAddress(addr) { + return false, tcpip.ErrBadAddress + } + + e.mu.Lock() + defer e.mu.Unlock() + return e.mu.addressableEndpointState.JoinGroup(addr) +} + +// LeaveGroup implements stack.GroupAddressableEndpoint. +func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) { + e.mu.Lock() + defer e.mu.Unlock() + return e.mu.addressableEndpointState.LeaveGroup(addr) +} + +// IsInGroup implements stack.GroupAddressableEndpoint. +func (e *endpoint) IsInGroup(addr tcpip.Address) bool { + e.mu.RLock() + defer e.mu.RUnlock() + return e.mu.addressableEndpointState.IsInGroup(addr) +} + +var _ stack.ForwardingNetworkProtocol = (*protocol)(nil) +var _ stack.NetworkProtocol = (*protocol)(nil) type protocol struct { - ids []uint32 - hashIV uint32 + stack *stack.Stack // defaultTTL is the current default TTL for the protocol. Only the - // uint8 portion of it is meaningful and it must be accessed - // atomically. + // uint8 portion of it is meaningful. + // + // Must be accessed using atomic operations. defaultTTL uint32 + // forwarding is set to 1 when the protocol has forwarding enabled and 0 + // when it is disabled. + // + // Must be accessed using atomic operations. + forwarding uint32 + + ids []uint32 + hashIV uint32 + fragmentation *fragmentation.Fragmentation } @@ -484,10 +717,10 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { } // SetOption implements NetworkProtocol.SetOption. -func (p *protocol) SetOption(option interface{}) *tcpip.Error { +func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error { switch v := option.(type) { - case tcpip.DefaultTTLOption: - p.SetDefaultTTL(uint8(v)) + case *tcpip.DefaultTTLOption: + p.SetDefaultTTL(uint8(*v)) return nil default: return tcpip.ErrUnknownProtocolOption @@ -495,7 +728,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { } // Option implements NetworkProtocol.Option. -func (p *protocol) Option(option interface{}) *tcpip.Error { +func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) @@ -521,37 +754,28 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} -// Parse implements stack.TransportProtocol.Parse. +// Parse implements stack.NetworkProtocol.Parse. 
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { + if ok := parse.IPv4(pkt); !ok { return 0, false, false } - ipHdr := header.IPv4(hdr) - // Header may have options, determine the true header length. - headerLen := int(ipHdr.HeaderLength()) - if headerLen < header.IPv4MinimumSize { - // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in - // order for the packet to be valid. Figure out if we want to reject this - // case. - headerLen = header.IPv4MinimumSize - } - hdr, ok = pkt.NetworkHeader().Consume(headerLen) - if !ok { - return 0, false, false - } - ipHdr = header.IPv4(hdr) + ipHdr := header.IPv4(pkt.NetworkHeader().View()) + return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true +} - // If this is a fragment, don't bother parsing the transport header. - parseTransportHeader := true - if ipHdr.More() || ipHdr.FragmentOffset() != 0 { - parseTransportHeader = false - } +// Forwarding implements stack.ForwardingNetworkProtocol. +func (p *protocol) Forwarding() bool { + return uint8(atomic.LoadUint32(&p.forwarding)) == 1 +} - pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber - pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) - return ipHdr.TransportProtocol(), parseTransportHeader, true +// SetForwarding implements stack.ForwardingNetworkProtocol. +func (p *protocol) SetForwarding(v bool) { + if v { + atomic.StoreUint32(&p.forwarding, 1) + } else { + atomic.StoreUint32(&p.forwarding, 0) + } } // calculateMTU calculates the network-layer payload MTU based on the link-layer @@ -563,19 +787,41 @@ func calculateMTU(mtu uint32) uint32 { return mtu - header.IPv4MinimumSize } +// calculateFragmentInnerMTU calculates the maximum number of bytes of +// fragmentable data a fragment can have, based on the link layer mtu and pkt's +// network header size. +func calculateFragmentInnerMTU(mtu uint32, pkt *stack.PacketBuffer) uint32 { + if mtu > MaxTotalSize { + mtu = MaxTotalSize + } + mtu -= uint32(pkt.NetworkHeader().View().Size()) + // Round the MTU down to align to 8 bytes. + mtu &^= 7 + return mtu +} + +// addressToUint32 translates an IPv4 address into its little endian uint32 +// representation. +// +// This function does the same thing as binary.LittleEndian.Uint32 but operates +// on a tcpip.Address (a string) without the need to convert it to a byte slice, +// which would cause an allocation. +func addressToUint32(addr tcpip.Address) uint32 { + _ = addr[3] // bounds check hint to compiler + return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24 +} + // hashRoute calculates a hash value for the given route. It uses the source & -// destination address, the transport protocol number, and a random initial -// value (generated once on initialization) to generate the hash. +// destination address, the transport protocol number and a 32-bit number to +// generate the hash. func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { - t := r.LocalAddress - a := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 - t = r.RemoteAddress - b := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + a := addressToUint32(r.LocalAddress) + b := addressToUint32(r.RemoteAddress) return hash.Hash3Words(a, b, uint32(protocol), hashIV) } // NewProtocol returns an IPv4 network protocol. 
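The addressToUint32 helper above can be sanity-checked against encoding/binary; since tcpip.Address is a string type, a plain byte string stands in for it here:

package main

import (
	"encoding/binary"
	"fmt"
)

func addressToUint32(addr string) uint32 {
	_ = addr[3] // bounds check hint to compiler
	return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24
}

func main() {
	addr := "\xc0\xa8\x01\x3a" // 192.168.1.58
	got := addressToUint32(addr)
	want := binary.LittleEndian.Uint32([]byte(addr))
	fmt.Printf("%#08x %v\n", got, got == want) // 0x3a01a8c0 true
}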
-func NewProtocol() stack.NetworkProtocol { +func NewProtocol(s *stack.Stack) stack.NetworkProtocol { ids := make([]uint32, buckets) // Randomly initialize hashIV and the ids. @@ -586,9 +832,33 @@ func NewProtocol() stack.NetworkProtocol { hashIV := r[buckets] return &protocol{ + stack: s, ids: ids, hashIV: hashIV, defaultTTL: DefaultTTL, - fragmentation: fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), + fragmentation: fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, reassembleTimeout, s.Clock()), + } +} + +func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) { + fragPkt, offset, copied, more := pf.BuildNextFragment() + fragPkt.NetworkProtocolNumber = ProtocolNumber + + originalIPHeaderLength := len(originalIPHeader) + nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength)) + + if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) { + panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength)) } + + flags := originalIPHeader.Flags() + if more { + flags |= header.IPv4FlagMoreFragments + } + nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset)) + nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied)) + nextFragIPHeader.SetChecksum(0) + nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum()) + + return fragPkt, more } diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 197e3bc51..fee11bb38 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -15,29 +15,36 @@ package ipv4_test import ( - "bytes" + "context" "encoding/hex" "fmt" - "math/rand" + "math" + "net" "testing" "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/testutil" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) +const extraHeaderReserve = 50 + func TestExcludeBroadcast(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) const defaultMTU = 65536 @@ -92,38 +99,297 @@ func TestExcludeBroadcast(t *testing.T) { }) } -// makeRandPkt generates a randomize packet. hdrLength indicates how much -// data should already be in the header before WritePacket. extraLength -// indicates how much extra space should be in the header. The payload is made -// from many Views of the sizes listed in viewSizes. 
-func makeRandPkt(hdrLength int, extraLength int, viewSizes []int) *stack.PacketBuffer { - var views []buffer.View - totalLength := 0 - for _, s := range viewSizes { - newView := buffer.NewView(s) - rand.Read(newView) - views = append(views, newView) - totalLength += s +// TestIPv4Sanity sends IP/ICMP packets with various problems to the stack and +// checks the response. +func TestIPv4Sanity(t *testing.T) { + const ( + defaultMTU = header.IPv6MinimumMTU + ttl = 255 + nicID = 1 + randomSequence = 123 + randomIdent = 42 + ) + var ( + ipv4Addr = tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("192.168.1.58").To4()), + PrefixLen: 24, + } + remoteIPv4Addr = tcpip.Address(net.ParseIP("10.0.0.1").To4()) + ) + + tests := []struct { + name string + headerLength uint8 // value of 0 means "use correct size" + badHeaderChecksum bool + maxTotalLength uint16 + transportProtocol uint8 + TTL uint8 + shouldFail bool + expectICMP bool + ICMPType header.ICMPv4Type + ICMPCode header.ICMPv4Code + options []byte + }{ + { + name: "valid", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + }, + { + name: "bad header checksum", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + badHeaderChecksum: true, + shouldFail: true, + }, + // The TTL tests check that we are not rejecting an incoming packet + // with a zero or one TTL, which has been a point of confusion in the + // past as RFC 791 says: "If this field contains the value zero, then the + // datagram must be destroyed". However RFC 1122 section 3.2.1.7 clarifies + // for the case of the destination host, stating as follows. + // + // A host MUST NOT send a datagram with a Time-to-Live (TTL) + // value of zero. + // + // A host MUST NOT discard a datagram just because it was + // received with TTL less than 2. 
+ { + name: "zero TTL", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: 0, + shouldFail: false, + }, + { + name: "one TTL", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: 1, + shouldFail: false, + }, + { + name: "End options", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: []byte{0, 0, 0, 0}, + }, + { + name: "NOP options", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: []byte{1, 1, 1, 1}, + }, + { + name: "NOP and End options", + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: []byte{1, 1, 0, 0}, + }, + { + name: "bad header length", + headerLength: header.IPv4MinimumSize - 1, + maxTotalLength: defaultMTU, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + shouldFail: true, + expectICMP: false, + }, + { + name: "bad total length (0)", + maxTotalLength: 0, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + shouldFail: true, + expectICMP: false, + }, + { + name: "bad total length (ip - 1)", + maxTotalLength: uint16(header.IPv4MinimumSize - 1), + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + shouldFail: true, + expectICMP: false, + }, + { + name: "bad total length (ip + icmp - 1)", + maxTotalLength: uint16(header.IPv4MinimumSize + header.ICMPv4MinimumSize - 1), + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + shouldFail: true, + expectICMP: false, + }, + { + name: "bad protocol", + maxTotalLength: defaultMTU, + transportProtocol: 99, + TTL: ttl, + shouldFail: true, + expectICMP: true, + ICMPType: header.ICMPv4DstUnreachable, + ICMPCode: header.ICMPv4ProtoUnreachable, + }, } - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: hdrLength + extraLength, - Data: buffer.NewVectorisedView(totalLength, views), - }) - pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber - if _, err := rand.Read(pkt.TransportHeader().Push(hdrLength)); err != nil { - panic(fmt.Sprintf("rand.Read: %s", err)) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4}, + }) + // We expect at most a single packet in response to our ICMP Echo Request. + e := channel.New(1, defaultMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr} + if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, ipv4ProtoAddr, err) + } + + // Default routes for IPv4 so ICMP can find a route to the remote + // node when attempting to send the ICMP Echo Reply. + s.SetRouteTable([]tcpip.Route{ + { + Destination: header.IPv4EmptySubnet, + NIC: nicID, + }, + }) + + // Round up the header size to the next multiple of 4 as RFC 791, page 11 + // says: "Internet Header Length is the length of the internet header + // in 32 bit words..." and on page 23: "The internet header padding is + // used to ensure that the internet header ends on a 32 bit boundary." 
+ ipHeaderLength := ((header.IPv4MinimumSize + len(test.options)) + header.IPv4IHLStride - 1) & ^(header.IPv4IHLStride - 1) + + if ipHeaderLength > header.IPv4MaximumHeaderSize { + t.Fatalf("too many bytes in options: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize) + } + totalLen := uint16(ipHeaderLength + header.ICMPv4MinimumSize) + hdr := buffer.NewPrependable(int(totalLen)) + icmp := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) + + // Specify ident/seq to make sure we get the same in the response. + icmp.SetIdent(randomIdent) + icmp.SetSequence(randomSequence) + icmp.SetType(header.ICMPv4Echo) + icmp.SetCode(header.ICMPv4UnusedCode) + icmp.SetChecksum(0) + icmp.SetChecksum(^header.Checksum(icmp, 0)) + ip := header.IPv4(hdr.Prepend(ipHeaderLength)) + if test.maxTotalLength < totalLen { + totalLen = test.maxTotalLength + } + ip.Encode(&header.IPv4Fields{ + IHL: uint8(ipHeaderLength), + TotalLength: totalLen, + Protocol: test.transportProtocol, + TTL: test.TTL, + SrcAddr: remoteIPv4Addr, + DstAddr: ipv4Addr.Address, + }) + if n := copy(ip.Options(), test.options); n != len(test.options) { + t.Fatalf("options larger than available space: copied %d/%d bytes", n, len(test.options)) + } + // Override the correct value if the test case specified one. + if test.headerLength != 0 { + ip.SetHeaderLength(test.headerLength) + } + ip.SetChecksum(0) + ipHeaderChecksum := ip.CalculateChecksum() + if test.badHeaderChecksum { + ipHeaderChecksum += 42 + } + ip.SetChecksum(^ipHeaderChecksum) + requestPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + }) + e.InjectInbound(header.IPv4ProtocolNumber, requestPkt) + reply, ok := e.Read() + if !ok { + if test.shouldFail { + if test.expectICMP { + t.Fatal("expected ICMP error response missing") + } + return // Expected silent failure. + } + t.Fatal("expected ICMP echo reply missing") + } + + // Check the route that brought the packet to us. + if reply.Route.LocalAddress != ipv4Addr.Address { + t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", reply.Route.LocalAddress, ipv4Addr.Address) + } + if reply.Route.RemoteAddress != remoteIPv4Addr { + t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", reply.Route.RemoteAddress, remoteIPv4Addr) + } + + // Make sure it's all in one buffer. + vv := buffer.NewVectorisedView(reply.Pkt.Size(), reply.Pkt.Views()) + replyIPHeader := header.IPv4(vv.ToView()) + + // At this stage we only know it's an IP header so verify that much. + checker.IPv4(t, replyIPHeader, + checker.SrcAddr(ipv4Addr.Address), + checker.DstAddr(remoteIPv4Addr), + ) + + // All expected responses are ICMP packets. + if got, want := replyIPHeader.Protocol(), uint8(header.ICMPv4ProtocolNumber); got != want { + t.Fatalf("not ICMP response, got protocol %d, want = %d", got, want) + } + replyICMPHeader := header.ICMPv4(replyIPHeader.Payload()) + + // Sanity check the response. 
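The header-length rounding used in the test above is the usual add-stride-minus-one-then-mask idiom. A standalone check with concrete values, assuming header.IPv4IHLStride is 4 bytes (the IHL field counts 32-bit words):

package main

import "fmt"

const ipv4MinimumSize, ihlStride = 20, 4

// roundedHeaderLength rounds the header (20 bytes plus options) up to the
// next multiple of 4 by adding stride-1 and clearing the low two bits.
func roundedHeaderLength(optionsLen int) int {
	return (ipv4MinimumSize + optionsLen + ihlStride - 1) &^ (ihlStride - 1)
}

func main() {
	for _, optLen := range []int{0, 1, 3, 4, 5} {
		fmt.Println(optLen, roundedHeaderLength(optLen)) // 20, 24, 24, 24, 28
	}
}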
+ switch replyICMPHeader.Type() { + case header.ICMPv4DstUnreachable: + checker.IPv4(t, replyIPHeader, + checker.IPFullLength(uint16(header.IPv4MinimumSize+header.ICMPv4MinimumSize+requestPkt.Size())), + checker.IPv4HeaderLength(header.IPv4MinimumSize), + checker.ICMPv4( + checker.ICMPv4Code(test.ICMPCode), + checker.ICMPv4Checksum(), + checker.ICMPv4Payload([]byte(hdr.View())), + ), + ) + if !test.shouldFail || !test.expectICMP { + t.Fatalf("unexpected packet rejection, got ICMP error packet type %d, code %d", + header.ICMPv4DstUnreachable, replyICMPHeader.Code()) + } + return + case header.ICMPv4EchoReply: + checker.IPv4(t, replyIPHeader, + checker.IPv4HeaderLength(ipHeaderLength), + checker.IPv4Options(test.options), + checker.IPFullLength(uint16(requestPkt.Size())), + checker.ICMPv4( + checker.ICMPv4Code(header.ICMPv4UnusedCode), + checker.ICMPv4Seq(randomSequence), + checker.ICMPv4Ident(randomIdent), + checker.ICMPv4Checksum(), + ), + ) + if test.shouldFail { + t.Fatalf("unexpected Echo Reply packet\n") + } + default: + t.Fatalf("unexpected ICMP response, got type %d, want = %d or %d", + replyICMPHeader.Type(), header.ICMPv4EchoReply, header.ICMPv4DstUnreachable) + } + }) } - return pkt } // comparePayloads compared the contents of all the packets against the contents // of the source packet. -func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) { - t.Helper() - // Make a complete array of the sourcePacketInfo packet. - source := header.IPv4(packets[0].NetworkHeader().View()[:header.IPv4MinimumSize]) - vv := buffer.NewVectorisedView(sourcePacketInfo.Size(), sourcePacketInfo.Views()) +func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketBuffer, mtu uint32, wantFragments []fragmentInfo, proto tcpip.TransportProtocolNumber) error { + // Make a complete array of the sourcePacket packet. + source := header.IPv4(packets[0].NetworkHeader().View()) + vv := buffer.NewVectorisedView(sourcePacket.Size(), sourcePacket.Views()) source = append(source, vv.ToView()...) // Make a copy of the IP header, which will be modified in some fields to make @@ -132,199 +398,251 @@ func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketI sourceCopy.SetChecksum(0) sourceCopy.SetFlagsFragmentOffset(0, 0) sourceCopy.SetTotalLength(0) - var offset uint16 // Build up an array of the bytes sent. - var reassembledPayload []byte + var reassembledPayload buffer.VectorisedView for i, packet := range packets { // Confirm that the packet is valid. 
allBytes := buffer.NewVectorisedView(packet.Size(), packet.Views()) - ip := header.IPv4(allBytes.ToView()) - if !ip.IsValid(len(ip)) { - t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip)) + fragmentIPHeader := header.IPv4(allBytes.ToView()) + if !fragmentIPHeader.IsValid(len(fragmentIPHeader)) { + return fmt.Errorf("fragment #%d: IP packet is invalid:\n%s", i, hex.Dump(fragmentIPHeader)) } - if got, want := ip.CalculateChecksum(), uint16(0xffff); got != want { - t.Errorf("ip.CalculateChecksum() got %#x, want %#x", got, want) + if got := len(fragmentIPHeader); got > int(mtu) { + return fmt.Errorf("fragment #%d: got len(fragmentIPHeader) = %d, want <= %d", i, got, mtu) } - if got, want := len(ip), int(mtu); got > want { - t.Errorf("fragment is too large, got %d want %d", got, want) + if got := fragmentIPHeader.TransportProtocol(); got != proto { + return fmt.Errorf("fragment #%d: got fragmentIPHeader.TransportProtocol() = %d, want = %d", i, got, uint8(proto)) } - if i == 0 { - got := packet.NetworkHeader().View().Size() + packet.TransportHeader().View().Size() - // sourcePacketInfo does not have NetworkHeader added, simulate one. - want := header.IPv4MinimumSize + sourcePacketInfo.TransportHeader().View().Size() - // Check that it kept the transport header in packet.TransportHeader if - // it fits in the first fragment. - if want < int(mtu) && got != want { - t.Errorf("first fragment hdr parts should have unmodified length if possible: got %d, want %d", got, want) - } + if got := packet.AvailableHeaderBytes(); got != extraHeaderReserve { + return fmt.Errorf("fragment #%d: got packet.AvailableHeaderBytes() = %d, want = %d", i, got, extraHeaderReserve) } - if got, want := packet.AvailableHeaderBytes(), sourcePacketInfo.AvailableHeaderBytes()-header.IPv4MinimumSize; got != want { - t.Errorf("fragment #%d should have the same available space for prepending as source: got %d, want %d", i, got, want) + if got, want := packet.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber; got != want { + return fmt.Errorf("fragment #%d: got fragment.NetworkProtocolNumber = %d, want = %d", i, got, want) } - if got, want := packet.NetworkProtocolNumber, sourcePacketInfo.NetworkProtocolNumber; got != want { - t.Errorf("fragment #%d has wrong network protocol number: got %d, want %d", i, got, want) + if got, want := fragmentIPHeader.CalculateChecksum(), uint16(0xffff); got != want { + return fmt.Errorf("fragment #%d: got ip.CalculateChecksum() = %#x, want = %#x", i, got, want) } - if i < len(packets)-1 { - sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, offset) + if wantFragments[i].more { + sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, wantFragments[i].offset) } else { - sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, offset) + sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, wantFragments[i].offset) } - reassembledPayload = append(reassembledPayload, ip.Payload()...) - offset += ip.TotalLength() - uint16(ip.HeaderLength()) + reassembledPayload.AppendView(packet.TransportHeader().View()) + reassembledPayload.Append(packet.Data) // Clear out the checksum and length from the ip because we can't compare // it. 
- sourceCopy.SetTotalLength(uint16(len(ip))) + sourceCopy.SetTotalLength(wantFragments[i].payloadSize + header.IPv4MinimumSize) sourceCopy.SetChecksum(0) sourceCopy.SetChecksum(^sourceCopy.CalculateChecksum()) - if !bytes.Equal(ip[:ip.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]) { - t.Errorf("ip[:ip.HeaderLength()] got:\n%s\nwant:\n%s", hex.Dump(ip[:ip.HeaderLength()]), hex.Dump(sourceCopy[:sourceCopy.HeaderLength()])) - } - } - expected := source[source.HeaderLength():] - if !bytes.Equal(reassembledPayload, expected) { - t.Errorf("reassembledPayload got:\n%s\nwant:\n%s", hex.Dump(reassembledPayload), hex.Dump(expected)) - } -} - -type errorChannel struct { - *channel.Endpoint - Ch chan *stack.PacketBuffer - packetCollectorErrors []*tcpip.Error -} - -// newErrorChannel creates a new errorChannel endpoint. Each call to WritePacket -// will return successive errors from packetCollectorErrors until the list is -// empty and then return nil each time. -func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel { - return &errorChannel{ - Endpoint: channel.New(size, mtu, linkAddr), - Ch: make(chan *stack.PacketBuffer, size), - packetCollectorErrors: packetCollectorErrors, - } -} - -// Drain removes all outbound packets from the channel and counts them. -func (e *errorChannel) Drain() int { - c := 0 - for { - select { - case <-e.Ch: - c++ - default: - return c + if diff := cmp.Diff(fragmentIPHeader[:fragmentIPHeader.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]); diff != "" { + return fmt.Errorf("fragment #%d: fragmentIPHeader mismatch (-want +got):\n%s", i, diff) } } -} -// WritePacket stores outbound packets into the channel. -func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - select { - case e.Ch <- pkt: - default: + expected := buffer.View(source[source.HeaderLength():]) + if diff := cmp.Diff(expected, reassembledPayload.ToView()); diff != "" { + return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff) } - nextError := (*tcpip.Error)(nil) - if len(e.packetCollectorErrors) > 0 { - nextError = e.packetCollectorErrors[0] - e.packetCollectorErrors = e.packetCollectorErrors[1:] - } - return nextError + return nil } -type context struct { - stack.Route - linkEP *errorChannel +type fragmentInfo struct { + offset uint16 + more bool + payloadSize uint16 } -func buildContext(t *testing.T, packetCollectorErrors []*tcpip.Error, mtu uint32) context { - // Make the packet and write it. - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - }) - ep := newErrorChannel(100 /* Enough for all tests. 
*/, mtu, "", packetCollectorErrors) - s.CreateNIC(1, ep) - const ( - src = "\x10\x00\x00\x01" - dst = "\x10\x00\x00\x02" - ) - s.AddAddress(1, ipv4.ProtocolNumber, src) +var fragmentationTests = []struct { + description string + mtu uint32 + gso *stack.GSO + transportHeaderLength int + payloadSize int + wantFragments []fragmentInfo +}{ { - subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask(header.IPv4Broadcast)) - if err != nil { - t.Fatal(err) - } - s.SetRouteTable([]tcpip.Route{{ - Destination: subnet, - NIC: 1, - }}) - } - r, err := s.FindRoute(0, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */) - if err != nil { - t.Fatalf("s.FindRoute got %v, want %v", err, nil) - } - return context{ - Route: r, - linkEP: ep, - } + description: "No Fragmentation", + mtu: 1280, + gso: nil, + transportHeaderLength: 0, + payloadSize: 1000, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1000, more: false}, + }, + }, + { + description: "Fragmented", + mtu: 1280, + gso: nil, + transportHeaderLength: 0, + payloadSize: 2000, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1256, more: true}, + {offset: 1256, payloadSize: 744, more: false}, + }, + }, + { + description: "No fragmentation with big header", + mtu: 2000, + gso: nil, + transportHeaderLength: 100, + payloadSize: 1000, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1100, more: false}, + }, + }, + { + description: "Fragmented with gso none", + mtu: 1280, + gso: &stack.GSO{Type: stack.GSONone}, + transportHeaderLength: 0, + payloadSize: 1400, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1256, more: true}, + {offset: 1256, payloadSize: 144, more: false}, + }, + }, + { + description: "Fragmented with big header", + mtu: 1280, + gso: nil, + transportHeaderLength: 100, + payloadSize: 1200, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1256, more: true}, + {offset: 1256, payloadSize: 44, more: false}, + }, + }, + { + description: "Fragmented with MTU smaller than header", + mtu: 300, + gso: nil, + transportHeaderLength: 1000, + payloadSize: 500, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 280, more: true}, + {offset: 280, payloadSize: 280, more: true}, + {offset: 560, payloadSize: 280, more: true}, + {offset: 840, payloadSize: 280, more: true}, + {offset: 1120, payloadSize: 280, more: true}, + {offset: 1400, payloadSize: 100, more: false}, + }, + }, } -func TestFragmentation(t *testing.T) { - var manyPayloadViewsSizes [1000]int - for i := range manyPayloadViewsSizes { - manyPayloadViewsSizes[i] = 7 - } - fragTests := []struct { - description string - mtu uint32 - gso *stack.GSO - hdrLength int - extraLength int - payloadViewsSizes []int - expectedFrags int - }{ - {"NoFragmentation", 2000, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 1}, - {"NoFragmentationWithBigHeader", 2000, &stack.GSO{}, 16, header.IPv4MinimumSize, []int{1000}, 1}, - {"Fragmented", 800, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 2}, - {"FragmentedWithGsoNil", 800, nil, 0, header.IPv4MinimumSize, []int{1000}, 2}, - {"FragmentedWithManyViews", 300, &stack.GSO{}, 0, header.IPv4MinimumSize, manyPayloadViewsSizes[:], 25}, - {"FragmentedWithManyViewsAndPrependableBytes", 300, &stack.GSO{}, 0, header.IPv4MinimumSize + 55, manyPayloadViewsSizes[:], 25}, - {"FragmentedWithBigHeader", 800, &stack.GSO{}, 20, header.IPv4MinimumSize, []int{1000}, 2}, - {"FragmentedWithBigHeaderAndPrependableBytes", 800, &stack.GSO{}, 20, header.IPv4MinimumSize + 66, []int{1000}, 2}, - 
{"FragmentedWithMTUSmallerThanHeaderAndPrependableBytes", 300, &stack.GSO{}, 1000, header.IPv4MinimumSize + 77, []int{500}, 6}, - } +func TestFragmentationWritePacket(t *testing.T) { + const ttl = 42 - for _, ft := range fragTests { + for _, ft := range fragmentationTests { t.Run(ft.description, func(t *testing.T) { - pkt := makeRandPkt(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes) + ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32) + r := buildRoute(t, ep) + pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber) source := pkt.Clone() - c := buildContext(t, nil, ft.mtu) - err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{ + err := r.WritePacket(ft.gso, stack.NetworkHeaderParams{ Protocol: tcp.ProtocolNumber, - TTL: 42, + TTL: ttl, TOS: stack.DefaultTOS, }, pkt) if err != nil { - t.Errorf("err got %v, want %v", err, nil) + t.Fatalf("r.WritePacket(_, _, _) = %s", err) } - - var results []*stack.PacketBuffer - L: - for { - select { - case pi := <-c.linkEP.Ch: - results = append(results, pi) - default: - break L - } + if got := len(ep.WrittenPackets); got != len(ft.wantFragments) { + t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, len(ft.wantFragments)) } - - if got, want := len(results), ft.expectedFrags; got != want { - t.Errorf("len(result) got %d, want %d", got, want) + if got := int(r.Stats().IP.PacketsSent.Value()); got != len(ft.wantFragments) { + t.Errorf("got c.Route.Stats().IP.PacketsSent.Value() = %d, want = %d", got, len(ft.wantFragments)) + } + if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 { + t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got) + } + if err := compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil { + t.Error(err) } - if got, want := len(results), int(c.Route.Stats().IP.PacketsSent.Value()); got != want { - t.Errorf("no errors yet len(result) got %d, want %d", got, want) + }) + } +} + +func TestFragmentationWritePackets(t *testing.T) { + const ttl = 42 + writePacketsTests := []struct { + description string + insertBefore int + insertAfter int + }{ + { + description: "Single packet", + insertBefore: 0, + insertAfter: 0, + }, + { + description: "With packet before", + insertBefore: 1, + insertAfter: 0, + }, + { + description: "With packet after", + insertBefore: 0, + insertAfter: 1, + }, + { + description: "With packet before and after", + insertBefore: 1, + insertAfter: 1, + }, + } + tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, extraHeaderReserve+header.IPv4MinimumSize, []int{1}, header.IPv4ProtocolNumber) + + for _, test := range writePacketsTests { + t.Run(test.description, func(t *testing.T) { + for _, ft := range fragmentationTests { + t.Run(ft.description, func(t *testing.T) { + var pkts stack.PacketBufferList + for i := 0; i < test.insertBefore; i++ { + pkts.PushBack(tinyPacket.Clone()) + } + pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber) + pkts.PushBack(pkt.Clone()) + for i := 0; i < test.insertAfter; i++ { + pkts.PushBack(tinyPacket.Clone()) + } + + ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32) + r := buildRoute(t, ep) + + wantTotalPackets := len(ft.wantFragments) + test.insertBefore + test.insertAfter + n, err := r.WritePackets(ft.gso, pkts, stack.NetworkHeaderParams{ + Protocol: tcp.ProtocolNumber, + TTL: 
ttl, + TOS: stack.DefaultTOS, + }) + if err != nil { + t.Errorf("got WritePackets(_, _, _) = (_, %s), want = (_, nil)", err) + } + if n != wantTotalPackets { + t.Errorf("got WritePackets(_, _, _) = (%d, _), want = (%d, _)", n, wantTotalPackets) + } + if got := len(ep.WrittenPackets); got != wantTotalPackets { + t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, wantTotalPackets) + } + if got := int(r.Stats().IP.PacketsSent.Value()); got != wantTotalPackets { + t.Errorf("got c.Route.Stats().IP.PacketsSent.Value() = %d, want = %d", got, wantTotalPackets) + } + if got := int(r.Stats().IP.OutgoingPacketErrors.Value()); got != 0 { + t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got) + } + + if wantTotalPackets == 0 { + return + } + + fragments := ep.WrittenPackets[test.insertBefore : len(ft.wantFragments)+test.insertBefore] + if err := compareFragments(fragments, pkt, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil { + t.Error(err) + } + }) } - compareFragments(t, results, source, ft.mtu) }) } } @@ -332,155 +650,377 @@ func TestFragmentation(t *testing.T) { // TestFragmentationErrors checks that errors are returned from write packet // correctly. func TestFragmentationErrors(t *testing.T) { + const ttl = 42 + + expectedError := tcpip.ErrAborted fragTests := []struct { description string mtu uint32 - hdrLength int - payloadViewsSizes []int - packetCollectorErrors []*tcpip.Error + transportHeaderLength int + payloadSize int + allowPackets int + fragmentCount int }{ - {"NoFrag", 2000, 0, []int{1000}, []*tcpip.Error{tcpip.ErrAborted}}, - {"ErrorOnFirstFrag", 500, 0, []int{1000}, []*tcpip.Error{tcpip.ErrAborted}}, - {"ErrorOnSecondFrag", 500, 0, []int{1000}, []*tcpip.Error{nil, tcpip.ErrAborted}}, - {"ErrorOnFirstFragMTUSmallerThanHdr", 500, 1000, []int{500}, []*tcpip.Error{tcpip.ErrAborted}}, + { + description: "No frag", + mtu: 2000, + transportHeaderLength: 0, + payloadSize: 1000, + allowPackets: 0, + fragmentCount: 1, + }, + { + description: "Error on first frag", + mtu: 500, + transportHeaderLength: 0, + payloadSize: 1000, + allowPackets: 0, + fragmentCount: 3, + }, + { + description: "Error on second frag", + mtu: 500, + transportHeaderLength: 0, + payloadSize: 1000, + allowPackets: 1, + fragmentCount: 3, + }, + { + description: "Error on first frag MTU smaller than header", + mtu: 500, + transportHeaderLength: 1000, + payloadSize: 500, + allowPackets: 0, + fragmentCount: 4, + }, } for _, ft := range fragTests { t.Run(ft.description, func(t *testing.T) { - pkt := makeRandPkt(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes) - c := buildContext(t, ft.packetCollectorErrors, ft.mtu) - err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{ + ep := testutil.NewMockLinkEndpoint(ft.mtu, expectedError, ft.allowPackets) + r := buildRoute(t, ep) + pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber) + err := r.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{ Protocol: tcp.ProtocolNumber, - TTL: 42, + TTL: ttl, TOS: stack.DefaultTOS, }, pkt) - for i := 0; i < len(ft.packetCollectorErrors)-1; i++ { - if got, want := ft.packetCollectorErrors[i], (*tcpip.Error)(nil); got != want { - t.Errorf("ft.packetCollectorErrors[%d] got %v, want %v", i, got, want) - } + if err != expectedError { + t.Errorf("got WritePacket(_, _, _) = %s, want = %s", err, expectedError) } - // We only need to check that last error because all the ones before are - // nil. 
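// Aside (editorial sketch, not part of the change): the fragmentCount values
// in the cases above follow from RFC 791's rule that every fragment except
// the last must carry a payload that is a multiple of 8 octets. With the
// 20-byte minimum IPv4 header these tests assume, each fragment carries
// (mtu - 20) rounded down to a multiple of 8 upper-layer bytes, and the
// transport header is fragmented along with the payload. A hypothetical
// helper (expectedFragmentCount is not part of the package) showing the
// arithmetic:
func expectedFragmentCount(mtu, transportHeaderLength, payloadSize int) int {
	perFragment := (mtu - header.IPv4MinimumSize) &^ 7 // MTU 500 -> 480 bytes per fragment.
	total := transportHeaderLength + payloadSize       // Upper-layer bytes to fragment.
	return (total + perFragment - 1) / perFragment     // Ceiling division.
}
// For example, a 1000-byte payload over MTU 500 needs ceil(1000/480) = 3
// fragments, and a 1000-byte transport header plus a 500-byte payload needs
// ceil(1500/480) = 4, matching the fragmentCount values above.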
- if got, want := err, ft.packetCollectorErrors[len(ft.packetCollectorErrors)-1]; got != want { - t.Errorf("err got %v, want %v", got, want) + if got, want := len(ep.WrittenPackets), int(r.Stats().IP.PacketsSent.Value()); err != nil && got != want { + t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, want) } - if got, want := c.linkEP.Drain(), int(c.Route.Stats().IP.PacketsSent.Value())+1; err != nil && got != want { - t.Errorf("after linkEP error len(result) got %d, want %d", got, want) + if got, want := int(r.Stats().IP.OutgoingPacketErrors.Value()), ft.fragmentCount-ft.allowPackets; got != want { + t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = %d", got, want) } }) } } func TestInvalidFragments(t *testing.T) { + const ( + nicID = 1 + linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") + addr1 = "\x0a\x00\x00\x01" + addr2 = "\x0a\x00\x00\x02" + tos = 0 + ident = 1 + ttl = 48 + protocol = 6 + ) + + payloadGen := func(payloadLen int) []byte { + payload := make([]byte, payloadLen) + for i := 0; i < len(payload); i++ { + payload[i] = 0x30 + } + return payload + } + + type fragmentData struct { + ipv4fields header.IPv4Fields + payload []byte + autoChecksum bool // if true, the Checksum field will be overwritten. + } + // These packets have both IHL and TotalLength set to 0. - testCases := []struct { + tests := []struct { name string - packets [][]byte + fragments []fragmentData wantMalformedIPPackets uint64 wantMalformedFragments uint64 }{ { - "ihl_totallen_zero_valid_frag_offset", - [][]byte{ - {0x40, 0x30, 0x00, 0x00, 0x6c, 0x74, 0x7d, 0x30, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, - }, - 1, - 0, - }, - { - "ihl_totallen_zero_invalid_frag_offset", - [][]byte{ - {0x40, 0x30, 0x00, 0x00, 0x6c, 0x74, 0x20, 0x00, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, + name: "IHL and TotalLength zero, FragmentOffset non-zero", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: 0, + TOS: tos, + TotalLength: 0, + ID: ident, + Flags: header.IPv4FlagDontFragment | header.IPv4FlagMoreFragments, + FragmentOffset: 59776, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(12), + autoChecksum: true, + }, }, - 1, - 0, + wantMalformedIPPackets: 1, + wantMalformedFragments: 0, }, { - // Total Length of 37(20 bytes IP header + 17 bytes of - // payload) - // Frag Offset of 0x1ffe = 8190*8 = 65520 - // Leading to the fragment end to be past 65535. - "ihl_totallen_valid_invalid_frag_offset_1", - [][]byte{ - {0x45, 0x30, 0x00, 0x25, 0x6c, 0x74, 0x1f, 0xfe, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, + name: "IHL and TotalLength zero, FragmentOffset zero", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: 0, + TOS: tos, + TotalLength: 0, + ID: ident, + Flags: header.IPv4FlagMoreFragments, + FragmentOffset: 0, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(12), + autoChecksum: true, + }, }, - 1, - 1, + wantMalformedIPPackets: 1, + wantMalformedFragments: 0, }, - // The following 3 tests were found by running a fuzzer and were - // triggering a panic in the IPv4 reassembler code. 
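// Aside (editorial sketch, not part of the change): the 65536 boundary
// exercised by the surrounding cases falls out of the IPv4 wire format.
// FragmentOffset is a 13-bit field counted in 8-octet units, so the largest
// encodable offset is 8191*8 = 65528; these cases use 65520. Reassembly only
// has a 16-bit (65536-octet) space to place payload into, which the following
// hypothetical check mirrors:
func fragmentEndsPastLimit(fragmentOffset, payloadLen uint32) bool {
	const reassemblyLimit = 1 << 16 // 65536; assumed bound matching the expectations here.
	return fragmentOffset+payloadLen > reassemblyLimit
}
// Offset 65520 with 17 payload octets ends at 65537 and is counted as a
// malformed fragment, while 16 octets end at exactly 65536 and are accepted.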
{ - "ihl_less_than_ipv4_minimum_size_1", - [][]byte{ - {0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0x0, 0xf3, 0x30, 0x1, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, - {0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x1, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, + // Payload 17 octets and Fragment offset 65520 + // Leading to the fragment end to be past 65536. + name: "fragment ends past 65536", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 17, + ID: ident, + Flags: 0, + FragmentOffset: 65520, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(17), + autoChecksum: true, + }, }, - 2, - 0, + wantMalformedIPPackets: 1, + wantMalformedFragments: 1, }, { - "ihl_less_than_ipv4_minimum_size_2", - [][]byte{ - {0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0xb3, 0x12, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, - {0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, + // Payload 16 octets and fragment offset 65520 + // Leading to the fragment end to be exactly 65536. 
+ name: "fragment ends exactly at 65536", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 16, + ID: ident, + Flags: 0, + FragmentOffset: 65520, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(16), + autoChecksum: true, + }, }, - 2, - 0, + wantMalformedIPPackets: 0, + wantMalformedFragments: 0, }, { - "ihl_less_than_ipv4_minimum_size_3", - [][]byte{ - {0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0xb3, 0x30, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, - {0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, + name: "IHL less than IPv4 minimum size", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize - 12, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 28, + ID: ident, + Flags: 0, + FragmentOffset: 1944, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(28), + autoChecksum: true, + }, + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize - 12, + TOS: tos, + TotalLength: header.IPv4MinimumSize - 12, + ID: ident, + Flags: header.IPv4FlagMoreFragments, + FragmentOffset: 0, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(28), + autoChecksum: true, + }, }, - 2, - 0, + wantMalformedIPPackets: 2, + wantMalformedFragments: 0, }, { - "fragment_with_short_total_len_extra_payload", - [][]byte{ - {0x46, 0x30, 0x00, 0x30, 0x30, 0x40, 0x0e, 0x12, 0x30, 0x06, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, - {0x46, 0x30, 0x00, 0x18, 0x30, 0x40, 0x20, 0x00, 0x30, 0x06, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, + name: "fragment with short TotalLength and extra payload", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize + 4, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 28, + ID: ident, + Flags: 0, + FragmentOffset: 28816, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(28), + autoChecksum: true, + }, + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize + 4, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 4, + ID: ident, + Flags: header.IPv4FlagMoreFragments, + FragmentOffset: 0, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(28), + autoChecksum: true, + }, }, - 1, - 1, + wantMalformedIPPackets: 1, + wantMalformedFragments: 1, }, { - "multiple_fragments_with_more_fragments_set_to_false", - [][]byte{ - {0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x10, 0x00, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - 
{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x01, 0x61, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x20, 0x00, 0x00, 0x06, 0x34, 0x1e, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + name: "multiple fragments with More Fragments flag set to false", + fragments: []fragmentData{ + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 8, + ID: ident, + Flags: 0, + FragmentOffset: 128, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(8), + autoChecksum: true, + }, + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 8, + ID: ident, + Flags: 0, + FragmentOffset: 8, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(8), + autoChecksum: true, + }, + { + ipv4fields: header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TOS: tos, + TotalLength: header.IPv4MinimumSize + 8, + ID: ident, + Flags: header.IPv4FlagMoreFragments, + FragmentOffset: 0, + TTL: ttl, + Protocol: protocol, + SrcAddr: addr1, + DstAddr: addr2, + }, + payload: payloadGen(8), + autoChecksum: true, + }, }, - 1, - 1, + wantMalformedIPPackets: 1, + wantMalformedFragments: 1, }, } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - const nicID tcpip.NICID = 42 + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ - ipv4.NewProtocol(), + NetworkProtocols: []stack.NetworkProtocolFactory{ + ipv4.NewProtocol, }, }) + e := channel.New(0, 1500, linkAddr) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr2); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err) + } + + for _, f := range test.fragments { + pktSize := header.IPv4MinimumSize + len(f.payload) + hdr := buffer.NewPrependable(pktSize) - var linkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30}) - var remoteLinkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x31}) - ep := channel.New(10, 1500, linkAddr) - s.CreateNIC(nicID, sniffer.New(ep)) + ip := header.IPv4(hdr.Prepend(pktSize)) + ip.Encode(&f.ipv4fields) + copy(ip[header.IPv4MinimumSize:], f.payload) - for _, pkt := range tc.packets { - ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}), + if f.autoChecksum { + ip.SetChecksum(0) + ip.SetChecksum(^ip.CalculateChecksum()) + } + + vv := hdr.View().ToVectorisedView() + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: vv, })) } - if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), tc.wantMalformedIPPackets; got != want { + if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), test.wantMalformedIPPackets; got != want { t.Errorf("incorrect Stats.IP.MalformedPacketsReceived, got: %d, want: %d", got, want) } - if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), tc.wantMalformedFragments; got != want { + if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), test.wantMalformedFragments; got != want { 
t.Errorf("incorrect Stats.IP.MalformedFragmentsReceived, got: %d, want: %d", got, want) } }) @@ -534,6 +1074,9 @@ func TestReceiveFragments(t *testing.T) { // the fragment block size of 8 (RFC 791 section 3.1 page 14). ipv4Payload3Addr1ToAddr2 := udpGen(127, 3, addr1, addr2) udpPayload3Addr1ToAddr2 := ipv4Payload3Addr1ToAddr2[header.UDPMinimumSize:] + // Used to test the max reassembled payload length (65,535 octets). + ipv4Payload4Addr1ToAddr2 := udpGen(header.UDPMaximumSize-header.UDPMinimumSize, 4, addr1, addr2) + udpPayload4Addr1ToAddr2 := ipv4Payload4Addr1ToAddr2[header.UDPMinimumSize:] type fragmentData struct { srcAddr tcpip.Address @@ -827,14 +1370,36 @@ func TestReceiveFragments(t *testing.T) { }, expectedPayloads: nil, }, + { + name: "Two fragments reassembled into a maximum UDP packet", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload4Addr1ToAddr2[:65512], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 65512, + payload: ipv4Payload4Addr1ToAddr2[65512:], + }, + }, + expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2}, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { // Setup a stack and endpoint. s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) e := channel.New(0, 1280, tcpip.LinkAddress("\xf0\x00")) if err := s.CreateNIC(nicID, e); err != nil { @@ -877,6 +1442,7 @@ func TestReceiveFragments(t *testing.T) { SrcAddr: frag.srcAddr, DstAddr: frag.dstAddr, }) + ip.SetChecksum(^ip.CalculateChecksum()) vv := hdr.View().ToVectorisedView() vv.AppendView(frag.payload) @@ -906,3 +1472,394 @@ func TestReceiveFragments(t *testing.T) { }) } } + +func TestWriteStats(t *testing.T) { + const nPackets = 3 + + tests := []struct { + name string + setup func(*testing.T, *stack.Stack) + allowPackets int + expectSent int + expectDropped int + expectWritten int + }{ + { + name: "Accept all", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + allowPackets: math.MaxInt32, + expectSent: nPackets, + expectDropped: 0, + expectWritten: nPackets, + }, { + name: "Accept all with error", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + allowPackets: nPackets - 1, + expectSent: nPackets - 1, + expectDropped: 0, + expectWritten: nPackets - 1, + }, { + name: "Drop all", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %s", err) + } + }, + allowPackets: math.MaxInt32, + expectSent: 0, + expectDropped: nPackets, + expectWritten: nPackets, + }, { + name: "Drop some", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule that matches only 1 + // of the 3 packets. 
+ t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + // We'll match and DROP the last packet. + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}} + // Make sure the next rule is ACCEPT. + filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %s", err) + } + }, + allowPackets: math.MaxInt32, + expectSent: nPackets - 1, + expectDropped: 1, + expectWritten: nPackets, + }, + } + + // Parameterize the tests to run with both WritePacket and WritePackets. + writers := []struct { + name string + writePackets func(*stack.Route, stack.PacketBufferList) (int, *tcpip.Error) + }{ + { + name: "WritePacket", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + nWritten := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if err := rt.WritePacket(nil, stack.NetworkHeaderParams{}, pkt); err != nil { + return nWritten, err + } + nWritten++ + } + return nWritten, nil + }, + }, { + name: "WritePackets", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + return rt.WritePackets(nil, pkts, stack.NetworkHeaderParams{}) + }, + }, + } + + for _, writer := range writers { + t.Run(writer.name, func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep := testutil.NewMockLinkEndpoint(header.IPv4MinimumSize+header.UDPMinimumSize, tcpip.ErrInvalidEndpointState, test.allowPackets) + rt := buildRoute(t, ep) + + var pkts stack.PacketBufferList + for i := 0; i < nPackets; i++ { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()), + Data: buffer.NewView(0).ToVectorisedView(), + }) + pkt.TransportHeader().Push(header.UDPMinimumSize) + pkts.PushBack(pkt) + } + + test.setup(t, rt.Stack()) + + nWritten, _ := writer.writePackets(&rt, pkts) + + if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent { + t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) + } + if got := int(rt.Stats().IP.IPTablesOutputDropped.Value()); got != test.expectDropped { + t.Errorf("dropped %d packets, but expected to drop %d", got, test.expectDropped) + } + if nWritten != test.expectWritten { + t.Errorf("wrote %d packets, but expected WritePackets to return %d", nWritten, test.expectWritten) + } + }) + } + }) + } +} + +func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + }) + if err := s.CreateNIC(1, ep); err != nil { + t.Fatalf("CreateNIC(1, _) failed: %s", err) + } + const ( + src = "\x10\x00\x00\x01" + dst = "\x10\x00\x00\x02" + ) + if err := s.AddAddress(1, ipv4.ProtocolNumber, src); err != nil { + t.Fatalf("AddAddress(1, %d, %s) failed: %s", ipv4.ProtocolNumber, src, err) + } + { + mask := tcpip.AddressMask(header.IPv4Broadcast) + subnet, err := tcpip.NewSubnet(dst, mask) + if err != nil { + t.Fatalf("NewSubnet(%s, %s) failed: %v", dst, mask, err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: subnet, + NIC: 1, + }}) + } + rt, err := s.FindRoute(1, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */) + if err != nil { + 
t.Fatalf("FindRoute(1, %s, %s, %d, false) = %s", src, dst, ipv4.ProtocolNumber, err) + } + return rt +} + +// limitedMatcher is an iptables matcher that matches after a certain number of +// packets are checked against it. +type limitedMatcher struct { + limit int +} + +// Name implements Matcher.Name. +func (*limitedMatcher) Name() string { + return "limitedMatcher" +} + +// Match implements Matcher.Match. +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { + if lm.limit == 0 { + return true, false + } + lm.limit-- + return false, false +} + +func TestPacketQueing(t *testing.T) { + const nicID = 1 + + var ( + host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06") + host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09") + + host1IPv4Addr = tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("192.168.0.1").To4()), + PrefixLen: 24, + }, + } + host2IPv4Addr = tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("192.168.0.2").To4()), + PrefixLen: 8, + }, + } + ) + + tests := []struct { + name string + rxPkt func(*channel.Endpoint) + checkResp func(*testing.T, *channel.Endpoint) + }{ + { + name: "ICMP Error", + rxPkt: func(e *channel.Endpoint) { + hdr := buffer.NewPrependable(header.IPv4MinimumSize + header.UDPMinimumSize) + u := header.UDP(hdr.Prepend(header.UDPMinimumSize)) + u.Encode(&header.UDPFields{ + SrcPort: 5555, + DstPort: 80, + Length: header.UDPMinimumSize, + }) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, host2IPv4Addr.AddressWithPrefix.Address, host1IPv4Addr.AddressWithPrefix.Address, header.UDPMinimumSize) + sum = header.Checksum(header.UDP([]byte{}), sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: header.IPv4MinimumSize + header.UDPMinimumSize, + TTL: ipv4.DefaultTTL, + Protocol: uint8(udp.ProtocolNumber), + SrcAddr: host2IPv4Addr.AddressWithPrefix.Address, + DstAddr: host1IPv4Addr.AddressWithPrefix.Address, + }) + ip.SetChecksum(^ip.CalculateChecksum()) + e.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + }, + checkResp: func(t *testing.T, e *channel.Endpoint) { + p, ok := e.ReadContext(context.Background()) + if !ok { + t.Fatalf("timed out waiting for packet") + } + if p.Proto != header.IPv4ProtocolNumber { + t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber) + } + if p.Route.RemoteLinkAddress != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + } + checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address), + checker.DstAddr(host2IPv4Addr.AddressWithPrefix.Address), + checker.ICMPv4( + checker.ICMPv4Type(header.ICMPv4DstUnreachable), + checker.ICMPv4Code(header.ICMPv4PortUnreachable))) + }, + }, + + { + name: "Ping", + rxPkt: func(e *channel.Endpoint) { + totalLen := header.IPv4MinimumSize + header.ICMPv4MinimumSize + hdr := buffer.NewPrependable(totalLen) + pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) + pkt.SetType(header.ICMPv4Echo) + pkt.SetCode(0) + pkt.SetChecksum(0) + pkt.SetChecksum(^header.Checksum(pkt, 0)) + ip := 
header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: uint16(totalLen), + Protocol: uint8(icmp.ProtocolNumber4), + TTL: ipv4.DefaultTTL, + SrcAddr: host2IPv4Addr.AddressWithPrefix.Address, + DstAddr: host1IPv4Addr.AddressWithPrefix.Address, + }) + ip.SetChecksum(^ip.CalculateChecksum()) + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + }, + checkResp: func(t *testing.T, e *channel.Endpoint) { + p, ok := e.ReadContext(context.Background()) + if !ok { + t.Fatalf("timed out waiting for packet") + } + if p.Proto != header.IPv4ProtocolNumber { + t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber) + } + if p.Route.RemoteLinkAddress != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + } + checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address), + checker.DstAddr(host2IPv4Addr.AddressWithPrefix.Address), + checker.ICMPv4( + checker.ICMPv4Type(header.ICMPv4EchoReply), + checker.ICMPv4Code(header.ICMPv4UnusedCode))) + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e := channel.New(1, header.IPv6MinimumMTU, host1NICLinkAddr) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + }) + + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + if err := s.AddProtocolAddress(nicID, host1IPv4Addr); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, host1IPv4Addr, err) + } + + s.SetRouteTable([]tcpip.Route{ + { + Destination: host1IPv4Addr.AddressWithPrefix.Subnet(), + NIC: nicID, + }, + }) + + // Receive a packet to trigger link resolution before a response is sent. + test.rxPkt(e) + + // Wait for an ARP request since link address resolution should be + // performed.
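// Aside (editorial): the expected sequence in this test is: (1) the injected
// packet generates a response whose route has no link address yet, so the
// response is queued pending resolution; (2) the stack multicasts an ARP
// request, which is read and validated below; (3) injecting the ARP reply
// completes resolution and releases the queued response to checkResp.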
+ { + p, ok := e.ReadContext(context.Background()) + if !ok { + t.Fatalf("timed out waiting for packet") + } + if p.Proto != arp.ProtocolNumber { + t.Errorf("got p.Proto = %d, want = %d", p.Proto, arp.ProtocolNumber) + } + if p.Route.RemoteLinkAddress != header.EthernetBroadcastAddress { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, header.EthernetBroadcastAddress) + } + rep := header.ARP(p.Pkt.NetworkHeader().View()) + if got := rep.Op(); got != header.ARPRequest { + t.Errorf("got Op() = %d, want = %d", got, header.ARPRequest) + } + if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != host1NICLinkAddr { + t.Errorf("got HardwareAddressSender = %s, want = %s", got, host1NICLinkAddr) + } + if got := tcpip.Address(rep.ProtocolAddressSender()); got != host1IPv4Addr.AddressWithPrefix.Address { + t.Errorf("got ProtocolAddressSender = %s, want = %s", got, host1IPv4Addr.AddressWithPrefix.Address) + } + if got := tcpip.Address(rep.ProtocolAddressTarget()); got != host2IPv4Addr.AddressWithPrefix.Address { + t.Errorf("got ProtocolAddressTarget = %s, want = %s", got, host2IPv4Addr.AddressWithPrefix.Address) + } + } + + // Send an ARP reply to complete link address resolution. + { + hdr := buffer.View(make([]byte, header.ARPSize)) + packet := header.ARP(hdr) + packet.SetIPv4OverEthernet() + packet.SetOp(header.ARPReply) + copy(packet.HardwareAddressSender(), host2NICLinkAddr) + copy(packet.ProtocolAddressSender(), host2IPv4Addr.AddressWithPrefix.Address) + copy(packet.HardwareAddressTarget(), host1NICLinkAddr) + copy(packet.ProtocolAddressTarget(), host1IPv4Addr.AddressWithPrefix.Address) + e.InjectInbound(arp.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.ToVectorisedView(), + })) + } + + // Expect the response now that the link address has resolved. + test.checkResp(t, e) + + // Since link resolution was already performed, it shouldn't be performed + // again. + test.rxPkt(e) + test.checkResp(t, e) + }) + } +} diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index bcc64994e..a30437f02 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -5,15 +5,20 @@ package(licenses = ["notice"]) go_library( name = "ipv6", srcs = [ + "dhcpv6configurationfromndpra_string.go", "icmp.go", "ipv6.go", + "ndp.go", ], visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", + "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", ], ) @@ -34,8 +39,10 @@ go_test( "//pkg/tcpip/header", "//pkg/tcpip/link/channel", "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/testutil", "//pkg/tcpip/stack", "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/waiter", "@com_github_google_go_cmp//cmp:go_default_library", diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/network/ipv6/dhcpv6configurationfromndpra_string.go index d199ded6a..09ba133b1 100644 --- a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go +++ b/pkg/tcpip/network/ipv6/dhcpv6configurationfromndpra_string.go @@ -14,7 +14,7 @@ // Code generated by "stringer -type DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT. 
-package stack +package ipv6 import "strconv" diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 66d3a953a..ead6bedcb 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -41,7 +41,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack // Drop packet if it doesn't have the basic IPv6 header or if the // original source address doesn't match an address we own. src := hdr.SourceAddress() - if e.stack.CheckLocalAddress(e.NICID(), ProtocolNumber, src) == 0 { + if e.protocol.stack.CheckLocalAddress(e.nic.ID(), ProtocolNumber, src) == 0 { return } @@ -71,6 +71,59 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack e.dispatcher.DeliverTransportControlPacket(src, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt) } +// getLinkAddrOption searches NDP options for a given link address option using +// the provided getAddr function as a filter. Returns the link address if +// found; otherwise, returns the zero link address value. Also returns true if +// the options are valid as per the wire format, false otherwise. +func getLinkAddrOption(it header.NDPOptionIterator, getAddr func(header.NDPOption) tcpip.LinkAddress) (tcpip.LinkAddress, bool) { + var linkAddr tcpip.LinkAddress + for { + opt, done, err := it.Next() + if err != nil { + return "", false + } + if done { + break + } + if addr := getAddr(opt); len(addr) != 0 { + // No RFCs define what to do when an NDP message has multiple Link-Layer + // Address options. Since no interface can have multiple link-layer + // addresses, we consider such messages invalid. + if len(linkAddr) != 0 { + return "", false + } + linkAddr = addr + } + } + return linkAddr, true +} + +// getSourceLinkAddr searches NDP options for the source link address option. +// Returns the link address if found; otherwise, returns the zero link address +// value. Also returns true if the options are valid as per the wire format, +// false otherwise. +func getSourceLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { + return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress { + if src, ok := opt.(header.NDPSourceLinkLayerAddressOption); ok { + return src.EthernetAddress() + } + return "" + }) +} + +// getTargetLinkAddr searches NDP options for the target link address option. +// Returns the link address if found; otherwise, returns the zero link address +// value. Also returns true if the options are valid as per the wire format, +// false otherwise. +func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { + return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress { + if dst, ok := opt.(header.NDPTargetLinkLayerAddressOption); ok { + return dst.EthernetAddress() + } + return "" + }) +} + func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) { stats := r.Stats().ICMP sent := stats.V6PacketsSent @@ -137,7 +190,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme case header.ICMPv6NeighborSolicit: received.NeighborSolicit.Increment() - if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() { + if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize { received.Invalid.Increment() return } @@ -147,22 +200,16 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme // NDP messages cannot be fragmented. 
Also note that in the common case NDP // datagrams are very small and ToView() will not incur allocations. ns := header.NDPNeighborSolicit(payload.ToView()) - it, err := ns.Options().Iter(true) - if err != nil { - // If we have a malformed NDP NS option, drop the packet. + targetAddr := ns.TargetAddress() + + // As per RFC 4861 section 4.3, the Target Address MUST NOT be a multicast + // address. + if header.IsV6MulticastAddress(targetAddr) { received.Invalid.Increment() return } - targetAddr := ns.TargetAddress() - s := r.Stack() - if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil { - // We will only get an error if the NIC is unrecognized, which should not - // happen. For now, drop this packet. - // - // TODO(b/141002840): Handle this better? - return - } else if isTentative { + if e.hasTentativeAddr(targetAddr) { // If the target address is tentative and the source of the packet is a // unicast (specified) address, then the source of the packet is // attempting to perform address resolution on the target. In this case, @@ -175,7 +222,20 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme // stack know so it can handle such a scenario and do nothing further with // the NS. if r.RemoteAddress == header.IPv6Any { - s.DupTentativeAddrDetected(e.nicID, targetAddr) + // We would get an error if the address no longer exists or the address + // is no longer tentative (DAD resolved between the call to + // hasTentativeAddr and this point). Both of these are valid scenarios: + // 1) An address may be removed at any time. + // 2) As per RFC 4862 section 5.4, DAD is not perfect: + // "Note that the method for detecting duplicates + // is not completely reliable, and it is possible that duplicate + // addresses will still exist" + // + // TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate + // address is detected for an assigned address. + if err := e.dupTentativeAddrDetected(targetAddr); err != nil && err != tcpip.ErrBadAddress && err != tcpip.ErrInvalidEndpointState { + panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err)) + } } // Do not handle neighbor solicitations targeted to an address that is @@ -187,48 +247,34 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme // so the packet is processed as defined in RFC 4861, as per RFC 4862 // section 5.4.3. - // Is the NS targetting us? - if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 { + // Is the NS targeting us? + if r.Stack().CheckLocalAddress(e.nic.ID(), ProtocolNumber, targetAddr) == 0 { return } - // If the NS message contains the Source Link-Layer Address option, update - // the link address cache with the value of the option. - // - // TODO(b/148429853): Properly process the NS message and do Neighbor - // Unreachability Detection. var sourceLinkAddr tcpip.LinkAddress - for { - opt, done, err := it.Next() + { + it, err := ns.Options().Iter(false /* check */) if err != nil { - // This should never happen as Iter(true) above did not return an error. - panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err)) - } - if done { - break + // Options are not valid as per the wire format, silently drop the + // packet. + received.Invalid.Increment() + return } - switch opt := opt.(type) { - case header.NDPSourceLinkLayerAddressOption: - // No RFCs define what to do when an NS message has multiple Source - // Link-Layer Address options.
Since no interface can have multiple - // link-layer addresses, we consider such messages invalid. - if len(sourceLinkAddr) != 0 { - received.Invalid.Increment() - return - } - - sourceLinkAddr = opt.EthernetAddress() + sourceLinkAddr, ok = getSourceLinkAddr(it) + if !ok { + received.Invalid.Increment() + return } } - unspecifiedSource := r.RemoteAddress == header.IPv6Any - // As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST // NOT be included when the source IP address is the unspecified address. // Otherwise, on link layers that have addresses this option MUST be // included in multicast solicitations and SHOULD be included in unicast // solicitations. + unspecifiedSource := r.RemoteAddress == header.IPv6Any if len(sourceLinkAddr) == 0 { if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource { received.Invalid.Increment() @@ -237,57 +283,88 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme } else if unspecifiedSource { received.Invalid.Increment() return + } else if e.nud != nil { + e.nud.HandleProbe(r.RemoteAddress, r.LocalAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol) } else { - e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, sourceLinkAddr) - } - - // ICMPv6 Neighbor Solicit messages are always sent to - // specially crafted IPv6 multicast addresses. As a result, the - // route we end up with here has as its LocalAddress such a - // multicast address. It would be nonsense to claim that our - // source address is a multicast address, so we manually set - // the source address to the target address requested in the - // solicit message. Since that requires mutating the route, we - // must first clone it. - r := r.Clone() - defer r.Release() - r.LocalAddress = targetAddr + e.linkAddrCache.AddLinkAddress(e.nic.ID(), r.RemoteAddress, sourceLinkAddr) + } - // As per RFC 4861 section 7.2.4, if the the source of the solicitation is - // the unspecified address, the node MUST set the Solicited flag to zero and - // multicast the advertisement to the all-nodes address. - solicited := true + // As per RFC 4861 section 7.1.1: + // A node MUST silently discard any received Neighbor Solicitation + // messages that do not satisfy all of the following validity checks: + // ... + // - If the IP source address is the unspecified address, the IP + // destination address is a solicited-node multicast address. + if unspecifiedSource && !header.IsSolicitedNodeAddr(r.LocalAddress) { + received.Invalid.Increment() + return + } + + // As per RFC 4861 section 7.2.4: + // + // If the source of the solicitation is the unspecified address, the node + // MUST [...] and multicast the advertisement to the all-nodes address. + // + remoteAddr := r.RemoteAddress if unspecifiedSource { - solicited = false - r.RemoteAddress = header.IPv6AllNodesMulticastAddress + remoteAddr = header.IPv6AllNodesMulticastAddress + } + + // Even if we were able to receive a packet from some remote, we may not + // have a route to it - the remote may be blocked via routing rules. We must + // always consult our routing table and find a route to the remote before + // sending any packet. + r, err := e.protocol.stack.FindRoute(e.nic.ID(), targetAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */) + if err != nil { + // If we cannot find a route to the destination, silently drop the packet. 
+ return } + defer r.Release() - // If the NS has a source link-layer option, use the link address it - // specifies as the remote link address for the response instead of the - // source link address of the packet. + // If the NS has a source link-layer option, resolve the route immediately + // to avoid querying the neighbor table when the neighbor entry was updated + // as probing the neighbor table for a link address will transition the + // entry's state from stale to delay. + // + // Note, if the source link address is unspecified and this is a unicast + // solicitation, we may need to perform neighbor discovery to send the + // neighbor advertisement response. This is expected as per RFC 4861 section + // 7.2.4: + // + // Because unicast Neighbor Solicitations are not required to include a + // Source Link-Layer Address, it is possible that a node sending a + // solicited Neighbor Advertisement does not have a corresponding link- + // layer address for its neighbor in its Neighbor Cache. In such + // situations, a node will first have to use Neighbor Discovery to + // determine the link-layer address of its neighbor (i.e., send out a + // multicast Neighbor Solicitation). // - // TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link - // address cache for the right destination link address instead of manually - // patching the route with the remote link address if one is specified in a - // Source Link-Layer Address option. if len(sourceLinkAddr) != 0 { - r.RemoteLinkAddress = sourceLinkAddr + r.ResolveWith(sourceLinkAddr) } optsSerializer := header.NDPOptionsSerializer{ - header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress), + header.NDPTargetLinkLayerAddressOption(e.nic.LinkAddress()), } + neighborAdvertSize := header.ICMPv6NeighborAdvertMinimumSize + optsSerializer.Length() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()), + ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborAdvertSize, }) - packet := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize)) + pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber + packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize)) packet.SetType(header.ICMPv6NeighborAdvert) na := header.NDPNeighborAdvert(packet.NDPPayload()) - na.SetSolicitedFlag(solicited) + + // As per RFC 4861 section 7.2.4: + // + // If the source of the solicitation is the unspecified address, the node + // MUST set the Solicited flag to zero and [..]. Otherwise, the node MUST + // set the Solicited flag to one and [..]. + // + na.SetSolicitedFlag(!unspecifiedSource) na.SetOverrideFlag(true) na.SetTargetAddress(targetAddr) - opts := na.Options() - opts.Serialize(optsSerializer) + na.Options().Serialize(optsSerializer) packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) // RFC 4861 Neighbor Discovery for IP version 6 (IPv6) @@ -304,7 +381,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme case header.ICMPv6NeighborAdvert: received.NeighborAdvert.Increment() - if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() { + if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborAdvertMinimumSize { received.Invalid.Increment() return } @@ -314,28 +391,34 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme // 5, NDP messages cannot be fragmented. 
Also note that in the common case // NDP datagrams are very small and ToView() will not incur allocations. na := header.NDPNeighborAdvert(payload.ToView()) - it, err := na.Options().Iter(true) - if err != nil { - // If we have a malformed NDP NA option, drop the packet. - received.Invalid.Increment() - return - } - targetAddr := na.TargetAddress() - stack := r.Stack() - - if isTentative, err := stack.IsAddrTentative(e.nicID, targetAddr); err != nil { - // We will only get an error if the NIC is unrecognized, which should not - // happen. For now short-circuit this packet. - // - // TODO(b/141002840): Handle this better? - return - } else if isTentative { + if e.hasTentativeAddr(targetAddr) { // We just got an NA from a node that owns an address we are performing // DAD on, implying the address is not unique. In this case we let the // stack know so it can handle such a scenario and do nothing furthur with // the NDP NA. - stack.DupTentativeAddrDetected(e.nicID, targetAddr) + // + // We would get an error if the address no longer exists or the address + // is no longer tentative (DAD resolved between the call to + // hasTentativeAddr and this point). Both of these are valid scenarios: + // 1) An address may be removed at any time. + // 2) As per RFC 4862 section 5.4, DAD is not a perfect: + // "Note that the method for detecting duplicates + // is not completely reliable, and it is possible that duplicate + // addresses will still exist" + // + // TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate + // address is detected for an assigned address. + if err := e.dupTentativeAddrDetected(targetAddr); err != nil && err != tcpip.ErrBadAddress && err != tcpip.ErrInvalidEndpointState { + panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err)) + } + return + } + + it, err := na.Options().Iter(false /* check */) + if err != nil { + // If we have a malformed NDP NA option, drop the packet. + received.Invalid.Increment() return } @@ -348,40 +431,26 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme // TODO(b/143147598): Handle the scenario described above. Also inform the // netstack integration that a duplicate address was detected outside of // DAD. + targetLinkAddr, ok := getTargetLinkAddr(it) + if !ok { + received.Invalid.Increment() + return + } // If the NA message has the target link layer option, update the link // address cache with the link address for the target of the message. - // - // TODO(b/148429853): Properly process the NA message and do Neighbor - // Unreachability Detection. - var targetLinkAddr tcpip.LinkAddress - for { - opt, done, err := it.Next() - if err != nil { - // This should never happen as Iter(true) above did not return an error. - panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err)) - } - if done { - break - } - - switch opt := opt.(type) { - case header.NDPTargetLinkLayerAddressOption: - // No RFCs define what to do when an NA message has multiple Target - // Link-Layer Address options. Since no interface can have multiple - // link-layer addresses, we consider such messages invalid. 
- if len(targetLinkAddr) != 0 { - received.Invalid.Increment() - return - } - - targetLinkAddr = opt.EthernetAddress() + if e.nud == nil { + if len(targetLinkAddr) != 0 { + e.linkAddrCache.AddLinkAddress(e.nic.ID(), targetAddr, targetLinkAddr) } + return } - if len(targetLinkAddr) != 0 { - e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr) - } + e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{ + Solicited: na.SolicitedFlag(), + Override: na.OverrideFlag(), + IsRouter: na.RouterFlag(), + }) case header.ICMPv6EchoRequest: received.EchoRequest.Increment() @@ -391,8 +460,6 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme return } - remoteLinkAddr := r.RemoteLinkAddress - // As per RFC 4291 section 2.7, multicast addresses must not be used as // source addresses in IPv6 packets. localAddr := r.LocalAddress @@ -400,21 +467,19 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme localAddr = "" } - r, err := r.Stack().FindRoute(e.NICID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) + r, err := r.Stack().FindRoute(e.nic.ID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet. return } defer r.Release() - // Use the link address from the source of the original packet. - r.ResolveWith(remoteLinkAddr) - replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize, Data: pkt.Data, }) packet := header.ICMPv6(replyPkt.TransportHeader().Push(header.ICMPv6EchoMinimumSize)) + pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber copy(packet, icmpHdr) packet.SetType(header.ICMPv6EchoReply) packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data)) @@ -440,27 +505,75 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme case header.ICMPv6RouterSolicit: received.RouterSolicit.Increment() - if !isNDPValid() { + + // + // Validate the RS as per RFC 4861 section 6.1.1. + // + + // Is the NDP payload of sufficient size to hold a Router Solicitation? + if !isNDPValid() || pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRSMinimumSize { received.Invalid.Increment() return } - case header.ICMPv6RouterAdvert: - received.RouterAdvert.Increment() + stack := r.Stack() + + // Is the networking stack operating as a router? + if !stack.Forwarding(ProtocolNumber) { + // ... No, silently drop the packet. + received.RouterOnlyPacketsDroppedByHost.Increment() + return + } + + // Note that in the common case NDP datagrams are very small and ToView() + // will not incur allocations. + rs := header.NDPRouterSolicit(payload.ToView()) + it, err := rs.Options().Iter(false /* check */) + if err != nil { + // Options are not valid as per the wire format, silently drop the packet. + received.Invalid.Increment() + return + } - // Is the NDP payload of sufficient size to hold a Router - // Advertisement? - if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() { + sourceLinkAddr, ok := getSourceLinkAddr(it) + if !ok { received.Invalid.Increment() return } - routerAddr := iph.SourceAddress() + // If the RS message has the source link layer option, update the link + // address cache with the link address for the source of the message.
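// Aside (editorial): RFC 4861 section 4.1 requires that the Source
// Link-Layer Address option MUST NOT appear when the IP source is the
// unspecified address, which is why the block below rejects that combination
// as invalid before feeding the option into the NUD state machine.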
+ if len(sourceLinkAddr) != 0 { + // As per RFC 4861 section 4.1, the Source Link-Layer Address Option MUST + // NOT be included when the source IP address is the unspecified address. + // Otherwise, it SHOULD be included on link layers that have addresses. + if r.RemoteAddress == header.IPv6Any { + received.Invalid.Increment() + return + } + + if e.nud != nil { + // A RS with a specified source IP address modifies the NUD state + // machine in the same way a reachability probe would. + e.nud.HandleProbe(r.RemoteAddress, r.LocalAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol) + } + } + + case header.ICMPv6RouterAdvert: + received.RouterAdvert.Increment() // // Validate the RA as per RFC 4861 section 6.1.2. // + // Is the NDP payload of sufficient size to hold a Router Advertisement? + if !isNDPValid() || pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize { + received.Invalid.Increment() + return + } + + routerAddr := iph.SourceAddress() + // Is the IP Source Address a link-local address? if !header.IsV6LinkLocalAddress(routerAddr) { // ...No, silently drop the packet. @@ -468,16 +581,18 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme return } - // The remainder of payload must be only the router advertisement, so - // payload.ToView() always returns the advertisement. Per RFC 6980 section - // 5, NDP messages cannot be fragmented. Also note that in the common case - // NDP datagrams are very small and ToView() will not incur allocations. + // Note that in the common case NDP datagrams are very small and ToView() + // will not incur allocations. ra := header.NDPRouterAdvert(payload.ToView()) - opts := ra.Options() + it, err := ra.Options().Iter(false /* check */) + if err != nil { + // Options are not valid as per the wire format, silently drop the packet. + received.Invalid.Increment() + return + } - // Are options valid as per the wire format? - if _, err := opts.Iter(true); err != nil { - // ...No, silently drop the packet. + sourceLinkAddr, ok := getSourceLinkAddr(it) + if !ok { received.Invalid.Increment() return } @@ -487,12 +602,33 @@ // as RFC 4861 section 6.1.2 is concerned. // - // Tell the NIC to handle the RA. - stack := r.Stack() - rxNICID := r.NICID() - stack.HandleNDPRA(rxNICID, routerAddr, ra) + // If the RA has the source link layer option, update the link address + // cache with the link address for the advertised router. + if len(sourceLinkAddr) != 0 && e.nud != nil { + e.nud.HandleProbe(routerAddr, r.LocalAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol) + } + + e.mu.Lock() + e.mu.ndp.handleRA(routerAddr, ra) + e.mu.Unlock() case header.ICMPv6RedirectMsg: + // TODO(gvisor.dev/issue/2285): Call `e.nud.HandleProbe` after validating + // this redirect message, as per RFC 4861 section 7.3.3: + // + // "A Neighbor Cache entry enters the STALE state when created as a + // result of receiving packets other than solicited Neighbor + // Advertisements (i.e., Router Solicitations, Router Advertisements, + // Redirects, and Neighbor Solicitations). These packets contain the + // link-layer address of either the sender or, in the case of Redirect, + // the redirection target. However, receipt of these link-layer + // addresses does not confirm reachability of the forward-direction path + // to that node.
Placing a newly created Neighbor Cache entry for which
+ // the link-layer address is known in the STALE state provides assurance
+ // that path failures are detected quickly. In addition, should a cached
+ // link-layer address be modified due to receiving one of the above
+ // messages, the state SHOULD also be set to STALE to provide prompt
+ // verification that the path to the new link-layer address is working."
received.RedirectMsg.Increment()
if !isNDPValid() {
received.Invalid.Increment()
@@ -504,18 +640,6 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
}
}
-const (
- ndpSolicitedFlag = 1 << 6
- ndpOverrideFlag = 1 << 5
-
- ndpOptSrcLinkAddr = 1
- ndpOptDstLinkAddr = 2
-
- icmpV6FlagOffset = 4
- icmpV6OptOffset = 24
- icmpV6LengthOffset = 25
-)
-
var _ stack.LinkAddressResolver = (*protocol)(nil)
// LinkAddressProtocol implements stack.LinkAddressResolver.
@@ -525,30 +649,38 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
// LinkAddressRequest implements stack.LinkAddressResolver.
func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error {
- snaddr := header.SolicitedNodeAddr(addr)
-
// TODO(b/148672031): Use stack.FindRoute instead of manually creating the
// route here. Note, we would need the nicID to do this properly so the right
// NIC (associated to linkEP) is used to send the NDP NS message.
- r := &stack.Route{
+ r := stack.Route{
LocalAddress: localAddr,
- RemoteAddress: snaddr,
+ RemoteAddress: addr,
+ LocalLinkAddress: linkEP.LinkAddress(),
RemoteLinkAddress: remoteLinkAddr,
}
+
+ // If a remote link address is not already known, then send a multicast
+ // solicitation since multicast addresses have a static mapping to link
+ // addresses.
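The static mapping mentioned in the comment above is standard: solicited-node addresses are defined by RFC 4291 section 2.7.1 and the Ethernet mapping by RFC 2464 section 7, which is what header.SolicitedNodeAddr and header.EthernetAddressFromMulticastIPv6Address compute. A minimal standalone sketch of both rules, using plain byte arrays instead of gvisor's tcpip types:

package main

import "fmt"

// solicitedNodeAddr returns ff02::1:ffXX:XXXX, keeping the low-order 24 bits
// of the target address (RFC 4291 section 2.7.1).
func solicitedNodeAddr(target [16]byte) [16]byte {
	snm := [16]byte{0: 0xff, 1: 0x02, 11: 0x01, 12: 0xff}
	copy(snm[13:], target[13:])
	return snm
}

// multicastEthernetAddr maps a multicast IPv6 address to 33:33 followed by
// the address's low-order 32 bits (RFC 2464 section 7).
func multicastEthernetAddr(addr [16]byte) [6]byte {
	return [6]byte{0x33, 0x33, addr[12], addr[13], addr[14], addr[15]}
}

func main() {
	target := [16]byte{0xfe, 0x80, 15: 0x01} // fe80::1
	snm := solicitedNodeAddr(target)
	fmt.Printf("solicited-node: %x\n", snm)                   // ff02...0001ff000001
	fmt.Printf("ethernet: %x\n", multicastEthernetAddr(snm)) // 3333ff000001
}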
if len(r.RemoteLinkAddress) == 0 {
- r.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(snaddr)
+ r.RemoteAddress = header.SolicitedNodeAddr(addr)
+ r.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(r.RemoteAddress)
}
+ optsSerializer := header.NDPOptionsSerializer{
+ header.NDPSourceLinkLayerAddressOption(linkEP.LinkAddress()),
+ }
+ neighborSolicitSize := header.ICMPv6NeighborSolicitMinimumSize + optsSerializer.Length()
pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
- ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize,
+ ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + neighborSolicitSize,
})
- icmpHdr := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize))
- icmpHdr.SetType(header.ICMPv6NeighborSolicit)
- copy(icmpHdr[icmpV6OptOffset-len(addr):], addr)
- icmpHdr[icmpV6OptOffset] = ndpOptSrcLinkAddr
- icmpHdr[icmpV6LengthOffset] = 1
- copy(icmpHdr[icmpV6LengthOffset+1:], linkEP.LinkAddress())
- icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+ pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
+ packet := header.ICMPv6(pkt.TransportHeader().Push(neighborSolicitSize))
+ packet.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(packet.NDPPayload())
+ ns.SetTargetAddress(addr)
+ ns.Options().Serialize(optsSerializer)
+ packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
length := uint16(pkt.Size())
ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
@@ -561,7 +693,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAdd
})
// TODO(stijlist): count this in ICMP stats.
- return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
+ return linkEP.WritePacket(&r, nil /* gso */, ProtocolNumber, pkt)
}
// ResolveStaticAddress implements stack.LinkAddressResolver.
@@ -571,3 +703,179 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
}
return tcpip.LinkAddress([]byte(nil)), false
}
+
+// ======= ICMP Error packet generation =========
+
+// icmpReason is a marker interface for IPv6 specific ICMP errors.
+type icmpReason interface {
+ isICMPReason()
+}
+
+// icmpReasonParameterProblem is an error during processing of extension headers
+// or the fixed header defined in RFC 4443 section 3.4.
+type icmpReasonParameterProblem struct {
+ code header.ICMPv6Code
+
+ // respondToMulticast indicates that we are sending a packet that falls under
+ // the exception outlined by RFC 4443 section 2.4 point e.3 exception 2:
+ //
+ // (e.3) A packet destined to an IPv6 multicast address. (There are
+ // two exceptions to this rule: (1) the Packet Too Big Message
+ // (Section 3.2) to allow Path MTU discovery to work for IPv6
+ // multicast, and (2) the Parameter Problem Message, Code 2
+ // (Section 3.4) reporting an unrecognized IPv6 option (see
+ // Section 4.2 of [IPv6]) that has the Option Type highest-
+ // order two bits set to 10).
+ respondToMulticast bool
+
+ // pointer is defined in the RFC 4443 section 3.4 which reads:
+ //
+ // Pointer Identifies the octet offset within the invoking packet
+ // where the error was detected.
+ //
+ // The pointer will point beyond the end of the ICMPv6
+ // packet if the field in error is beyond what can fit
+ // in the maximum size of an ICMPv6 error message.
+ pointer uint32
+}
+
+func (*icmpReasonParameterProblem) isICMPReason() {}
+
+// icmpReasonPortUnreachable is an error where the transport protocol has no
+// listener and no alternative means to inform the sender.
+type icmpReasonPortUnreachable struct{}
+
+func (*icmpReasonPortUnreachable) isICMPReason() {}
+
+// returnError takes an error descriptor and generates the appropriate ICMP
+// error packet for IPv6 and sends it.
+func (p *protocol) returnError(r *stack.Route, reason icmpReason, pkt *stack.PacketBuffer) *tcpip.Error {
+ // Only send ICMP error if the address is not a multicast v6
+ // address and the source is not the unspecified address.
+ //
+ // There are exceptions to this rule.
+ // See: point e.3) RFC 4443 section 2.4
+ //
+ // (e) An ICMPv6 error message MUST NOT be originated as a result of
+ // receiving the following:
+ //
+ // (e.1) An ICMPv6 error message.
+ //
+ // (e.2) An ICMPv6 redirect message [IPv6-DISC].
+ //
+ // (e.3) A packet destined to an IPv6 multicast address. (There are
+ // two exceptions to this rule: (1) the Packet Too Big Message
+ // (Section 3.2) to allow Path MTU discovery to work for IPv6
+ // multicast, and (2) the Parameter Problem Message, Code 2
+ // (Section 3.4) reporting an unrecognized IPv6 option (see
+ // Section 4.2 of [IPv6]) that has the Option Type highest-
+ // order two bits set to 10).
+ //
+ var allowResponseToMulticast bool
+ if reason, ok := reason.(*icmpReasonParameterProblem); ok {
+ allowResponseToMulticast = reason.respondToMulticast
+ }
+
+ if (!allowResponseToMulticast && header.IsV6MulticastAddress(r.LocalAddress)) || r.RemoteAddress == header.IPv6Any {
+ return nil
+ }
+
+ // Even if we were able to receive a packet from some remote, we may not have
+ // a route to it - the remote may be blocked via routing rules. We must always
+ // consult our routing table and find a route to the remote before sending any
+ // packet.
+ route, err := p.stack.FindRoute(r.NICID(), r.LocalAddress, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ return err
+ }
+ defer route.Release()
+ // From this point on, the incoming route should no longer be used; route
+ // must be used to send the ICMP error.
+ r = nil
+
+ stats := p.stack.Stats().ICMP
+ sent := stats.V6PacketsSent
+ if !p.stack.AllowICMPMessage() {
+ sent.RateLimited.Increment()
+ return nil
+ }
+
+ network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View()
+
+ if pkt.TransportProtocolNumber == header.ICMPv6ProtocolNumber {
+ // TODO(gvisor.dev/issues/3810): Sort this out when ICMP headers are stored.
+ // Unfortunately at this time ICMP Packets do not have a transport
+ // header separated out. It is in the Data part so we need to
+ // separate it out now. We will just pretend it is a minimal length
+ // ICMP packet as we don't really care if any later bits of a
+ // larger ICMP packet are in the header view or in the Data view.
+ transport, ok := pkt.TransportHeader().Consume(header.ICMPv6MinimumSize)
+ if !ok {
+ return nil
+ }
+ typ := header.ICMPv6(transport).Type()
+ if typ.IsErrorType() || typ == header.ICMPv6RedirectMsg {
+ return nil
+ }
+ }
+
+ // As per RFC 4443 section 2.4
+ //
+ // (c) Every ICMPv6 error message (type < 128) MUST include
+ // as much of the IPv6 offending (invoking) packet (the
+ // packet that caused the error) as possible without making
+ // the error message packet exceed the minimum IPv6 MTU
+ // [IPv6].
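The code that follows enforces this requirement by capping the route MTU at the IPv6 minimum and budgeting for the headers that must precede the quoted packet. A standalone sketch of the same arithmetic, with the constants written out (1280-byte minimum IPv6 MTU, 40-byte IPv6 header, 8-byte ICMPv6 error header; the real code takes these from the route and the header package):

// invokingPacketBudget returns how many bytes of the offending packet an
// ICMPv6 error may carry, per RFC 4443 section 2.4 (c).
func invokingPacketBudget(routeMTU, linkHdrLen int) int {
	const (
		ipv6MinimumMTU     = 1280 // no IPv6 link may have a smaller MTU
		ipv6HeaderSize     = 40
		icmpv6ErrorHdrSize = 8 // type, code, checksum, 4 type-specific bytes
	)
	mtu := routeMTU
	if mtu > ipv6MinimumMTU {
		mtu = ipv6MinimumMTU // never exceed the minimum MTU, per the RFC
	}
	available := mtu - linkHdrLen - ipv6HeaderSize - icmpv6ErrorHdrSize
	if available < ipv6HeaderSize {
		return 0 // not even the invoking packet's IPv6 header would fit
	}
	return available
}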
+ mtu := int(route.MTU()) + if mtu > header.IPv6MinimumMTU { + mtu = header.IPv6MinimumMTU + } + headerLen := int(route.MaxHeaderLength()) + header.ICMPv6ErrorHeaderSize + available := int(mtu) - headerLen + if available < header.IPv6MinimumSize { + return nil + } + payloadLen := network.Size() + transport.Size() + pkt.Data.Size() + if payloadLen > available { + payloadLen = available + } + payload := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) + payload.CapLength(payloadLen) + + newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: headerLen, + Data: payload, + }) + newPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber + + icmpHdr := header.ICMPv6(newPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize)) + var counter *tcpip.StatCounter + switch reason := reason.(type) { + case *icmpReasonParameterProblem: + icmpHdr.SetType(header.ICMPv6ParamProblem) + icmpHdr.SetCode(reason.code) + icmpHdr.SetTypeSpecific(reason.pointer) + counter = sent.ParamProblem + case *icmpReasonPortUnreachable: + icmpHdr.SetType(header.ICMPv6DstUnreachable) + icmpHdr.SetCode(header.ICMPv6PortUnreachable) + counter = sent.DstUnreachable + default: + panic(fmt.Sprintf("unsupported ICMP type %T", reason)) + } + icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, route.LocalAddress, route.RemoteAddress, newPkt.Data)) + if err := route.WritePacket( + nil, /* gso */ + stack.NetworkHeaderParams{ + Protocol: header.ICMPv6ProtocolNumber, + TTL: route.DefaultTTL(), + TOS: stack.DefaultTOS, + }, + newPkt, + ); err != nil { + sent.Dropped.Increment() + return err + } + counter.Increment() + return nil +} diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 9e4eeea77..8dc33c560 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -16,27 +16,36 @@ package ipv6 import ( "context" + "net" "reflect" "strings" "testing" + "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) const ( + nicID = 1 + linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f") defaultChannelSize = 1 defaultMTU = 65536 + + // Extra time to use when waiting for an async event to occur. + defaultAsyncPositiveEventTimeout = 30 * time.Second ) var ( @@ -48,8 +57,15 @@ type stubLinkEndpoint struct { stack.LinkEndpoint } +func (*stubLinkEndpoint) MTU() uint32 { + return defaultMTU +} + func (*stubLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities { - return 0 + // Indicate that resolution for link layer addresses is required to send + // packets over this link. This is needed so the NIC knows to allocate a + // neighbor table. 
+ return stack.CapabilityResolutionRequired } func (*stubLinkEndpoint) MaxHeaderLength() uint16 { @@ -70,7 +86,8 @@ type stubDispatcher struct { stack.TransportDispatcher } -func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) { +func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) stack.TransportPacketDisposition { + return stack.TransportPacketHandled } type stubLinkAddressCache struct { @@ -84,16 +101,221 @@ func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtoco func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) { } +type stubNUDHandler struct { + probeCount int + confirmationCount int +} + +var _ stack.NUDHandler = (*stubNUDHandler)(nil) + +func (s *stubNUDHandler) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes stack.LinkAddressResolver) { + s.probeCount++ +} + +func (s *stubNUDHandler) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags stack.ReachabilityConfirmationFlags) { + s.confirmationCount++ +} + +func (*stubNUDHandler) HandleUpperLevelConfirmation(addr tcpip.Address) { +} + +var _ stack.NetworkInterface = (*testInterface)(nil) + +type testInterface struct { + stack.NetworkLinkEndpoint + + linkAddr tcpip.LinkAddress +} + +func (i *testInterface) LinkAddress() tcpip.LinkAddress { + return i.linkAddr +} + +func (*testInterface) ID() tcpip.NICID { + return 0 +} + +func (*testInterface) IsLoopback() bool { + return false +} + +func (*testInterface) Name() string { + return "" +} + +func (*testInterface) Enabled() bool { + return true +} + func TestICMPCounts(t *testing.T) { + tests := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6}, + UseNeighborCache: test.useNeighborCache, + }) + { + if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil { + t.Fatalf("CreateNIC(_, _) = %s", err) + } + if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { + t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) + } + } + { + subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) + if err != nil { + t.Fatal(err) + } + s.SetRouteTable( + []tcpip.Route{{ + Destination: subnet, + NIC: nicID, + }}, + ) + } + + netProto := s.NetworkProtocolInstance(ProtocolNumber) + if netProto == nil { + t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber) + } + ep := netProto.NewEndpoint(&testInterface{}, &stubLinkAddressCache{}, &stubNUDHandler{}, &stubDispatcher{}) + defer ep.Close() + + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + + r, err := s.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err) + } + defer r.Release() + + var tllData [header.NDPLinkLayerAddressSize]byte + header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{ + 
header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) + + types := []struct { + typ header.ICMPv6Type + size int + extraData []byte + }{ + { + typ: header.ICMPv6DstUnreachable, + size: header.ICMPv6DstUnreachableMinimumSize, + }, + { + typ: header.ICMPv6PacketTooBig, + size: header.ICMPv6PacketTooBigMinimumSize, + }, + { + typ: header.ICMPv6TimeExceeded, + size: header.ICMPv6MinimumSize, + }, + { + typ: header.ICMPv6ParamProblem, + size: header.ICMPv6MinimumSize, + }, + { + typ: header.ICMPv6EchoRequest, + size: header.ICMPv6EchoMinimumSize, + }, + { + typ: header.ICMPv6EchoReply, + size: header.ICMPv6EchoMinimumSize, + }, + { + typ: header.ICMPv6RouterSolicit, + size: header.ICMPv6MinimumSize, + }, + { + typ: header.ICMPv6RouterAdvert, + size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize, + }, + { + typ: header.ICMPv6NeighborSolicit, + size: header.ICMPv6NeighborSolicitMinimumSize, + }, + { + typ: header.ICMPv6NeighborAdvert, + size: header.ICMPv6NeighborAdvertMinimumSize, + extraData: tllData[:], + }, + { + typ: header.ICMPv6RedirectMsg, + size: header.ICMPv6MinimumSize, + }, + } + + handleIPv6Payload := func(icmp header.ICMPv6) { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.IPv6MinimumSize, + Data: buffer.View(icmp).ToVectorisedView(), + }) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(len(icmp)), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: header.NDPHopLimit, + SrcAddr: r.LocalAddress, + DstAddr: r.RemoteAddress, + }) + ep.HandlePacket(&r, pkt) + } + + for _, typ := range types { + icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) + copy(icmp[typ.size:], typ.extraData) + icmp.SetType(typ.typ) + icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView())) + handleIPv6Payload(icmp) + } + + // Construct an empty ICMP packet so that + // Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented. 
+ handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize))) + + icmpv6Stats := s.Stats().ICMP.V6PacketsReceived + visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) { + if got, want := s.Value(), uint64(1); got != want { + t.Errorf("got %s = %d, want = %d", name, got, want) + } + }) + if t.Failed() { + t.Logf("stats:\n%+v", s.Stats()) + } + }) + } +} + +func TestICMPCountsWithNeighborCache(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6}, + UseNeighborCache: true, }) { - if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) + if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil { + t.Fatalf("CreateNIC(_, _) = %s", err) } - if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil { + if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) } } @@ -105,7 +327,7 @@ func TestICMPCounts(t *testing.T) { s.SetRouteTable( []tcpip.Route{{ Destination: subnet, - NIC: 1, + NIC: nicID, }}, ) } @@ -114,12 +336,16 @@ func TestICMPCounts(t *testing.T) { if netProto == nil { t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber) } - ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s) + ep := netProto.NewEndpoint(&testInterface{}, nil, &stubNUDHandler{}, &stubDispatcher{}) defer ep.Close() - r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + + r, err := s.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) if err != nil { - t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err) + t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err) } defer r.Release() @@ -250,12 +476,12 @@ func (e endpointWithResolutionCapability) Capabilities() stack.LinkEndpointCapab func newTestContext(t *testing.T) *testContext { c := &testContext{ s0: stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6}, }), s1: stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6}, }), } @@ -265,19 +491,19 @@ func newTestContext(t *testing.T) *testContext { if testing.Verbose() { wrappedEP0 = sniffer.New(wrappedEP0) } - if err := c.s0.CreateNIC(1, wrappedEP0); err != nil { + if err := c.s0.CreateNIC(nicID, wrappedEP0); err != nil { t.Fatalf("CreateNIC s0: %v", err) } - if err := c.s0.AddAddress(1, ProtocolNumber, lladdr0); err != nil { + if err := c.s0.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { t.Fatalf("AddAddress lladdr0: %v", err) } c.linkEP1 = channel.New(defaultChannelSize, defaultMTU, linkAddr1) wrappedEP1 := 
stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP1}) - if err := c.s1.CreateNIC(1, wrappedEP1); err != nil { + if err := c.s1.CreateNIC(nicID, wrappedEP1); err != nil { t.Fatalf("CreateNIC failed: %v", err) } - if err := c.s1.AddAddress(1, ProtocolNumber, lladdr1); err != nil { + if err := c.s1.AddAddress(nicID, ProtocolNumber, lladdr1); err != nil { t.Fatalf("AddAddress lladdr1: %v", err) } @@ -288,7 +514,7 @@ func newTestContext(t *testing.T) *testContext { c.s0.SetRouteTable( []tcpip.Route{{ Destination: subnet0, - NIC: 1, + NIC: nicID, }}, ) subnet1, err := tcpip.NewSubnet(lladdr0, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr0)))) @@ -298,7 +524,7 @@ func newTestContext(t *testing.T) *testContext { c.s1.SetRouteTable( []tcpip.Route{{ Destination: subnet1, - NIC: 1, + NIC: nicID, }}, ) @@ -359,9 +585,9 @@ func TestLinkResolution(t *testing.T) { c := newTestContext(t) defer c.cleanup() - r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) + r, err := c.s0.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) if err != nil { - t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err) + t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err) } defer r.Release() @@ -376,14 +602,14 @@ func TestLinkResolution(t *testing.T) { var wq waiter.Queue ep, err := c.s0.NewEndpoint(header.ICMPv6ProtocolNumber, ProtocolNumber, &wq) if err != nil { - t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err) + t.Fatalf("NewEndpoint(_) = (_, %s), want = (_, nil)", err) } for { - _, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: lladdr1}}) + _, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: nicID, Addr: lladdr1}}) if resCh != nil { if err != tcpip.ErrNoLinkAddress { - t.Fatalf("ep.Write(_) = _, <non-nil>, %s, want = _, <non-nil>, tcpip.ErrNoLinkAddress", err) + t.Fatalf("ep.Write(_) = (_, <non-nil>, %s), want = (_, <non-nil>, tcpip.ErrNoLinkAddress)", err) } for _, args := range []routeArgs{ {src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))}, @@ -399,7 +625,7 @@ func TestLinkResolution(t *testing.T) { continue } if err != nil { - t.Fatalf("ep.Write(_) = _, _, %s", err) + t.Fatalf("ep.Write(_) = (_, _, %s)", err) } break } @@ -424,6 +650,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) { size int extraData []byte statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter + routerOnly bool }{ { name: "DstUnreachable", @@ -480,6 +707,8 @@ func TestICMPChecksumValidationSimple(t *testing.T) { statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return stats.RouterSolicit }, + // Hosts MUST silently discard any received Router Solicitation messages. 
+ routerOnly: true, }, { name: "RouterAdvert", @@ -516,84 +745,133 @@ func TestICMPChecksumValidationSimple(t *testing.T) { }, } - for _, typ := range types { - t.Run(typ.name, func(t *testing.T) { - e := channel.New(10, 1280, linkAddr0) - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - }) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) - } - - if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) - } - { - subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) - if err != nil { - t.Fatal(err) - } - s.SetRouteTable( - []tcpip.Route{{ - Destination: subnet, - NIC: 1, - }}, - ) - } + tests := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } - handleIPv6Payload := func(checksum bool) { - icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) - copy(icmp[typ.size:], typ.extraData) - icmp.SetType(typ.typ) - if checksum { - icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView())) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for _, typ := range types { + for _, isRouter := range []bool{false, true} { + name := typ.name + if isRouter { + name += " (Router)" + } + t.Run(name, func(t *testing.T) { + e := channel.New(0, 1280, linkAddr0) + + // Indicate that resolution for link layer addresses is required to + // send packets over this link. This is needed so the NIC knows to + // allocate a neighbor table. + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + UseNeighborCache: test.useNeighborCache, + }) + if isRouter { + // Enabling forwarding makes the stack act as a router. + s.SetForwarding(ProtocolNumber, true) + } + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(_, _) = %s", err) + } + + if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { + t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) + } + { + subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) + if err != nil { + t.Fatal(err) + } + s.SetRouteTable( + []tcpip.Route{{ + Destination: subnet, + NIC: nicID, + }}, + ) + } + + handleIPv6Payload := func(checksum bool) { + icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) + copy(icmp[typ.size:], typ.extraData) + icmp.SetType(typ.typ) + if checksum { + icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView())) + } + ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(len(icmp)), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: header.NDPHopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, + }) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}), + }) + e.InjectInbound(ProtocolNumber, pkt) + } + + stats := s.Stats().ICMP.V6PacketsReceived + invalid := stats.Invalid + routerOnly := stats.RouterOnlyPacketsDroppedByHost + typStat := typ.statCounter(stats) + + // Initial stat counts should be 0. 
+ if got := invalid.Value(); got != 0 {
+ t.Fatalf("got invalid = %d, want = 0", got)
+ }
+ if got := routerOnly.Value(); got != 0 {
+ t.Fatalf("got RouterOnlyPacketsDroppedByHost = %d, want = 0", got)
+ }
+ if got := typStat.Value(); got != 0 {
+ t.Fatalf("got %s = %d, want = 0", typ.name, got)
+ }
+
+ // Without setting checksum, the incoming packet should
+ // be invalid.
+ handleIPv6Payload(false)
+ if got := invalid.Value(); got != 1 {
+ t.Fatalf("got invalid = %d, want = 1", got)
+ }
+ // Router only count should not have increased.
+ if got := routerOnly.Value(); got != 0 {
+ t.Fatalf("got RouterOnlyPacketsDroppedByHost = %d, want = 0", got)
+ }
+ // Rx count of type typ.typ should not have increased.
+ if got := typStat.Value(); got != 0 {
+ t.Fatalf("got %s = %d, want = 0", typ.name, got)
+ }
+
+ // When checksum is set, it should be received.
+ handleIPv6Payload(true)
+ if got := typStat.Value(); got != 1 {
+ t.Fatalf("got %s = %d, want = 1", typ.name, got)
+ }
+ // Invalid count should not have increased again.
+ if got := invalid.Value(); got != 1 {
+ t.Fatalf("got invalid = %d, want = 1", got)
+ }
+ if !isRouter && typ.routerOnly && test.useNeighborCache {
+ // Router only count should have increased.
+ if got := routerOnly.Value(); got != 1 {
+ t.Fatalf("got RouterOnlyPacketsDroppedByHost = %d, want = 1", got)
+ }
+ }
+ })
}
- ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
- ip.Encode(&header.IPv6Fields{
- PayloadLength: uint16(len(icmp)),
- NextHeader: uint8(header.ICMPv6ProtocolNumber),
- HopLimit: header.NDPHopLimit,
- SrcAddr: lladdr1,
- DstAddr: lladdr0,
- })
- pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
- Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
- })
- e.InjectInbound(ProtocolNumber, pkt)
- }
-
- stats := s.Stats().ICMP.V6PacketsReceived
- invalid := stats.Invalid
- typStat := typ.statCounter(stats)
-
- // Initial stat counts should be 0.
- if got := invalid.Value(); got != 0 {
- t.Fatalf("got invalid = %d, want = 0", got)
- }
- if got := typStat.Value(); got != 0 {
- t.Fatalf("got %s = %d, want = 0", typ.name, got)
- }
-
- // Without setting checksum, the incoming packet should
- // be invalid.
- handleIPv6Payload(false)
- if got := invalid.Value(); got != 1 {
- t.Fatalf("got invalid = %d, want = 1", got)
- }
- // Rx count of type typ.typ should not have increased.
- if got := typStat.Value(); got != 0 {
- t.Fatalf("got %s = %d, want = 0", typ.name, got)
- }
-
- // When checksum is set, it should be received.
- handleIPv6Payload(true)
- if got := typStat.Value(); got != 1 {
- t.Fatalf("got %s = %d, want = 1", typ.name, got)
- }
- // Invalid count should not have increased again.
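These checksum tests flip validity solely by skipping header.ICMPv6Checksum. That checksum is the standard one's-complement sum over an IPv6 pseudo-header (source, destination, upper-layer length, next header 58) plus the ICMPv6 message, per RFC 4443 section 2.3. A minimal sketch of the same computation, not gvisor's exact code:

// icmpv6Checksum computes the ICMPv6 checksum over the pseudo-header and the
// message itself; the message's checksum field must be zero when called.
func icmpv6Checksum(msg, src, dst []byte) uint16 {
	sum := func(b []byte, acc uint32) uint32 {
		for i := 0; i+1 < len(b); i += 2 {
			acc += uint32(b[i])<<8 | uint32(b[i+1])
		}
		if len(b)%2 == 1 {
			acc += uint32(b[len(b)-1]) << 8 // pad odd-length data with a zero byte
		}
		return acc
	}
	acc := sum(src, 0)
	acc = sum(dst, acc)
	acc += uint32(len(msg)) // upper-layer packet length (fits in 16 bits here)
	acc += 58               // next header value for ICMPv6
	acc = sum(msg, acc)
	for acc > 0xffff {
		acc = acc>>16 + acc&0xffff // fold carries back into 16 bits
	}
	return ^uint16(acc)
}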
- if got := invalid.Value(); got != 1 {
- t.Fatalf("got invalid = %d, want = 1", got)
}
})
}
@@ -694,13 +972,13 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
t.Run(typ.name, func(t *testing.T) {
e := channel.New(10, 1280, linkAddr0)
s := stack.New(stack.Options{
- NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
})
- if err := s.CreateNIC(1, e); err != nil {
- t.Fatalf("CreateNIC(_) = %s", err)
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("CreateNIC(_, _) = %s", err)
}
- if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+ if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
}
{
@@ -711,7 +989,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
s.SetRouteTable(
[]tcpip.Route{{
Destination: subnet,
- NIC: 1,
+ NIC: nicID,
}},
)
}
@@ -750,7 +1028,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
t.Fatalf("got invalid = %d, want = 0", got)
}
if got := typStat.Value(); got != 0 {
- t.Fatalf("got %s = %d, want = 0", typ.name, got)
+ t.Fatalf("got = %d, want = 0", got)
}
// Without setting checksum, the incoming packet should
@@ -761,13 +1039,13 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
}
// Rx count of type typ.typ should not have increased.
if got := typStat.Value(); got != 0 {
- t.Fatalf("got %s = %d, want = 0", typ.name, got)
+ t.Fatalf("got = %d, want = 0", got)
}
// When checksum is set, it should be received.
handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
if got := typStat.Value(); got != 1 {
- t.Fatalf("got %s = %d, want = 1", typ.name, got)
+ t.Fatalf("got = %d, want = 1", got)
}
// Invalid count should not have increased again.
if got := invalid.Value(); got != 1 {
@@ -872,14 +1150,14 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
t.Run(typ.name, func(t *testing.T) {
e := channel.New(10, 1280, linkAddr0)
s := stack.New(stack.Options{
- NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
})
- if err := s.CreateNIC(1, e); err != nil {
- t.Fatalf("CreateNIC(_) = %s", err)
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
}
- if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
- t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+ if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+ t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
}
{
subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
@@ -889,7 +1167,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
s.SetRouteTable(
[]tcpip.Route{{
Destination: subnet,
- NIC: 1,
+ NIC: nicID,
}},
)
}
@@ -929,7 +1207,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
t.Fatalf("got invalid = %d, want = 0", got)
}
if got := typStat.Value(); got != 0 {
- t.Fatalf("got %s = %d, want = 0", typ.name, got)
+ t.Fatalf("got = %d, want = 0", got)
}
// Without setting checksum, the incoming packet should
@@ -940,13 +1218,13 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
}
// Rx count of type typ.typ should not have increased.
if got := typStat.Value(); got != 0 {
- t.Fatalf("got %s = %d, want = 0", typ.name, got)
+ t.Fatalf("got = %d, want = 0", got)
}
// When checksum is set, it should be received.
handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
if got := typStat.Value(); got != 1 {
- t.Fatalf("got %s = %d, want = 1", typ.name, got)
+ t.Fatalf("got = %d, want = 1", got)
}
// Invalid count should not have increased again.
if got := invalid.Value(); got != 1 {
@@ -961,24 +1239,30 @@ func TestLinkAddressRequest(t *testing.T) {
mcaddr := header.EthernetAddressFromMulticastIPv6Address(snaddr)
tests := []struct {
- name string
- remoteLinkAddr tcpip.LinkAddress
- expectLinkAddr tcpip.LinkAddress
+ name string
+ remoteLinkAddr tcpip.LinkAddress
+ expectedLinkAddr tcpip.LinkAddress
+ expectedAddr tcpip.Address
}{
{
- name: "Unicast",
- remoteLinkAddr: linkAddr1,
- expectLinkAddr: linkAddr1,
+ name: "Unicast",
+ remoteLinkAddr: linkAddr1,
+ expectedLinkAddr: linkAddr1,
+ expectedAddr: lladdr0,
},
{
- name: "Multicast",
- remoteLinkAddr: "",
- expectLinkAddr: mcaddr,
+ name: "Multicast",
+ remoteLinkAddr: "",
+ expectedLinkAddr: mcaddr,
+ expectedAddr: snaddr,
},
}
for _, test := range tests {
- p := NewProtocol()
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+ })
+ p := s.NetworkProtocolInstance(ProtocolNumber)
linkRes, ok := p.(stack.LinkAddressResolver)
if !ok {
t.Fatalf("expected IPv6 protocol to implement stack.LinkAddressResolver")
@@ -993,9 +1277,466 @@ func TestLinkAddressRequest(t *testing.T) {
if !ok {
t.Fatal("expected to send a link address request")
}
+ if pkt.Route.RemoteLinkAddress != test.expectedLinkAddr {
+ t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedLinkAddr)
+ }
+ if pkt.Route.RemoteAddress != test.expectedAddr {
+ t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, test.expectedAddr)
+ }
+ if pkt.Route.LocalAddress != lladdr1 {
+ t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", pkt.Route.LocalAddress, lladdr1)
+ }
+ checker.IPv6(t, stack.PayloadSince(pkt.Pkt.NetworkHeader()),
+ checker.SrcAddr(lladdr1),
+ checker.DstAddr(test.expectedAddr),
+ checker.TTL(header.NDPHopLimit),
+ checker.NDPNS(
+ checker.NDPNSTargetAddress(lladdr0),
+ checker.NDPNSOptions([]header.NDPOption{header.NDPSourceLinkLayerAddressOption(linkAddr0)}),
+ ))
+ }
+}
- if got, want := pkt.Route.RemoteLinkAddress, test.expectLinkAddr; got != want {
- t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want)
+func TestPacketQueuing(t *testing.T) {
+ const nicID = 1
+
+ var (
+ host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06")
+ host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09")
+
+ host1IPv6Addr = tcpip.ProtocolAddress{
+ Protocol: ProtocolNumber,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: tcpip.Address(net.ParseIP("a::1").To16()),
+ PrefixLen: 64,
+ },
+ }
+ host2IPv6Addr = tcpip.ProtocolAddress{
+ Protocol: ProtocolNumber,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: tcpip.Address(net.ParseIP("a::2").To16()),
+ PrefixLen: 64,
+ },
}
+ )
+
+ tests := []struct {
+ name string
+ rxPkt func(*channel.Endpoint)
+ checkResp func(*testing.T, *channel.Endpoint)
+ }{
+ {
+ name: "ICMP Error",
+ rxPkt: func(e *channel.Endpoint) {
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.UDPMinimumSize)
+ u := header.UDP(hdr.Prepend(header.UDPMinimumSize))
+ u.Encode(&header.UDPFields{
+ SrcPort: 5555,
DstPort: 80, + Length: header.UDPMinimumSize, + }) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, header.UDPMinimumSize) + sum = header.Checksum(header.UDP([]byte{}), sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(udp.ProtocolNumber), + HopLimit: DefaultTTL, + SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, + DstAddr: host1IPv6Addr.AddressWithPrefix.Address, + }) + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + }, + checkResp: func(t *testing.T, e *channel.Endpoint) { + p, ok := e.ReadContext(context.Background()) + if !ok { + t.Fatalf("timed out waiting for packet") + } + if p.Proto != ProtocolNumber { + t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber) + } + if p.Route.RemoteLinkAddress != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + } + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address), + checker.DstAddr(host2IPv6Addr.AddressWithPrefix.Address), + checker.ICMPv6( + checker.ICMPv6Type(header.ICMPv6DstUnreachable), + checker.ICMPv6Code(header.ICMPv6PortUnreachable))) + }, + }, + + { + name: "Ping", + rxPkt: func(e *channel.Endpoint) { + totalLen := header.IPv6MinimumSize + header.ICMPv6MinimumSize + hdr := buffer.NewPrependable(totalLen) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize)) + pkt.SetType(header.ICMPv6EchoRequest) + pkt.SetCode(0) + pkt.SetChecksum(0) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, buffer.VectorisedView{})) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: header.ICMPv6MinimumSize, + NextHeader: uint8(icmp.ProtocolNumber6), + HopLimit: DefaultTTL, + SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, + DstAddr: host1IPv6Addr.AddressWithPrefix.Address, + }) + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + }, + checkResp: func(t *testing.T, e *channel.Endpoint) { + p, ok := e.ReadContext(context.Background()) + if !ok { + t.Fatalf("timed out waiting for packet") + } + if p.Proto != ProtocolNumber { + t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber) + } + if p.Route.RemoteLinkAddress != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + } + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address), + checker.DstAddr(host2IPv6Addr.AddressWithPrefix.Address), + checker.ICMPv6( + checker.ICMPv6Type(header.ICMPv6EchoReply), + checker.ICMPv6Code(header.ICMPv6UnusedCode))) + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + + e := channel.New(1, header.IPv6MinimumMTU, host1NICLinkAddr) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + }) + + if err := 
s.CreateNIC(nicID, e); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + if err := s.AddProtocolAddress(nicID, host1IPv6Addr); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, host1IPv6Addr, err) + } + + s.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: host1IPv6Addr.AddressWithPrefix.Subnet(), + NIC: nicID, + }, + }) + + // Receive a packet to trigger link resolution before a response is sent. + test.rxPkt(e) + + // Wait for a neighbor solicitation since link address resolution should + // be performed. + { + p, ok := e.ReadContext(context.Background()) + if !ok { + t.Fatalf("timed out waiting for packet") + } + if p.Proto != ProtocolNumber { + t.Errorf("got Proto = %d, want = %d", p.Proto, ProtocolNumber) + } + snmc := header.SolicitedNodeAddr(host2IPv6Addr.AddressWithPrefix.Address) + if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want) + } + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address), + checker.DstAddr(snmc), + checker.TTL(header.NDPHopLimit), + checker.NDPNS( + checker.NDPNSTargetAddress(host2IPv6Addr.AddressWithPrefix.Address), + checker.NDPNSOptions([]header.NDPOption{header.NDPSourceLinkLayerAddressOption(host1NICLinkAddr)}), + )) + } + + // Send a neighbor advertisement to complete link address resolution. + { + naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize + hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize) + pkt := header.ICMPv6(hdr.Prepend(naSize)) + pkt.SetType(header.ICMPv6NeighborAdvert) + na := header.NDPNeighborAdvert(pkt.NDPPayload()) + na.SetSolicitedFlag(true) + na.SetOverrideFlag(true) + na.SetTargetAddress(host2IPv6Addr.AddressWithPrefix.Address) + na.Options().Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(host2NICLinkAddr), + }) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(icmp.ProtocolNumber6), + HopLimit: header.NDPHopLimit, + SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, + DstAddr: host1IPv6Addr.AddressWithPrefix.Address, + }) + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + // Expect the response now that the link address has resolved. + test.checkResp(t, e) + + // Since link resolution was already performed, it shouldn't be performed + // again. 
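The final rxPkt/checkResp pair below verifies that a second packet is answered without another neighbor solicitation. Conceptually, a link-resolution queue of roughly this shape sits behind the behavior the test exercises; an illustrative sketch only, with types and method names assumed rather than taken from gvisor:

// pendingQueue sketches queue-until-resolved: packets for an unresolved next
// hop are buffered, a single solicitation is sent, and a confirmation (the NA
// injected above) flushes the queue.
type pendingQueue struct {
	resolved map[string]string   // next-hop IP -> link address
	waiting  map[string][][]byte // next-hop IP -> queued packets
	solicit  func(nextHop string)
	send     func(linkAddr string, pkt []byte)
}

func (q *pendingQueue) WritePacket(nextHop string, pkt []byte) {
	if linkAddr, ok := q.resolved[nextHop]; ok {
		q.send(linkAddr, pkt) // already resolved: no solicitation needed
		return
	}
	if len(q.waiting[nextHop]) == 0 {
		q.solicit(nextHop) // first packet for this next hop triggers an NS
	}
	q.waiting[nextHop] = append(q.waiting[nextHop], pkt)
}

func (q *pendingQueue) HandleConfirmation(nextHop, linkAddr string) {
	q.resolved[nextHop] = linkAddr
	for _, pkt := range q.waiting[nextHop] {
		q.send(linkAddr, pkt)
	}
	delete(q.waiting, nextHop)
}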
+ test.rxPkt(e) + test.checkResp(t, e) + }) + } +} + +func TestCallsToNeighborCache(t *testing.T) { + tests := []struct { + name string + createPacket func() header.ICMPv6 + multicast bool + source tcpip.Address + destination tcpip.Address + wantProbeCount int + wantConfirmationCount int + }{ + { + name: "Unicast Neighbor Solicitation without source link-layer address option", + createPacket: func() header.ICMPv6 { + nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize + icmp := header.ICMPv6(buffer.NewView(nsSize)) + icmp.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns.SetTargetAddress(lladdr0) + return icmp + }, + source: lladdr1, + destination: lladdr0, + // "The source link-layer address option SHOULD be included in unicast + // solicitations." - RFC 4861 section 4.3 + // + // A Neighbor Advertisement needs to be sent in response, but the + // Neighbor Cache shouldn't be updated since we have no useful + // information about the sender. + wantProbeCount: 0, + }, + { + name: "Unicast Neighbor Solicitation with source link-layer address option", + createPacket: func() header.ICMPv6 { + nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize + icmp := header.ICMPv6(buffer.NewView(nsSize)) + icmp.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns.SetTargetAddress(lladdr0) + ns.Options().Serialize(header.NDPOptionsSerializer{ + header.NDPSourceLinkLayerAddressOption(linkAddr1), + }) + return icmp + }, + source: lladdr1, + destination: lladdr0, + wantProbeCount: 1, + }, + { + name: "Multicast Neighbor Solicitation without source link-layer address option", + createPacket: func() header.ICMPv6 { + nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize + icmp := header.ICMPv6(buffer.NewView(nsSize)) + icmp.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns.SetTargetAddress(lladdr0) + return icmp + }, + source: lladdr1, + destination: header.SolicitedNodeAddr(lladdr0), + // "The source link-layer address option MUST be included in multicast + // solicitations." 
- RFC 4861 section 4.3 + wantProbeCount: 0, + }, + { + name: "Multicast Neighbor Solicitation with source link-layer address option", + createPacket: func() header.ICMPv6 { + nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize + icmp := header.ICMPv6(buffer.NewView(nsSize)) + icmp.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns.SetTargetAddress(lladdr0) + ns.Options().Serialize(header.NDPOptionsSerializer{ + header.NDPSourceLinkLayerAddressOption(linkAddr1), + }) + return icmp + }, + source: lladdr1, + destination: header.SolicitedNodeAddr(lladdr0), + wantProbeCount: 1, + }, + { + name: "Unicast Neighbor Advertisement without target link-layer address option", + createPacket: func() header.ICMPv6 { + naSize := header.ICMPv6NeighborAdvertMinimumSize + icmp := header.ICMPv6(buffer.NewView(naSize)) + icmp.SetType(header.ICMPv6NeighborAdvert) + na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na.SetSolicitedFlag(true) + na.SetOverrideFlag(false) + na.SetTargetAddress(lladdr1) + return icmp + }, + source: lladdr1, + destination: lladdr0, + // "When responding to unicast solicitations, the target link-layer + // address option can be omitted since the sender of the solicitation has + // the correct link-layer address; otherwise, it would not be able to + // send the unicast solicitation in the first place." + // - RFC 4861 section 4.4 + wantConfirmationCount: 1, + }, + { + name: "Unicast Neighbor Advertisement with target link-layer address option", + createPacket: func() header.ICMPv6 { + naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize + icmp := header.ICMPv6(buffer.NewView(naSize)) + icmp.SetType(header.ICMPv6NeighborAdvert) + na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na.SetSolicitedFlag(true) + na.SetOverrideFlag(false) + na.SetTargetAddress(lladdr1) + na.Options().Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) + return icmp + }, + source: lladdr1, + destination: lladdr0, + wantConfirmationCount: 1, + }, + { + name: "Multicast Neighbor Advertisement without target link-layer address option", + createPacket: func() header.ICMPv6 { + naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize + icmp := header.ICMPv6(buffer.NewView(naSize)) + icmp.SetType(header.ICMPv6NeighborAdvert) + na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na.SetSolicitedFlag(false) + na.SetOverrideFlag(false) + na.SetTargetAddress(lladdr1) + return icmp + }, + source: lladdr1, + destination: header.IPv6AllNodesMulticastAddress, + // "Target link-layer address MUST be included for multicast solicitations + // in order to avoid infinite Neighbor Solicitation "recursion" when the + // peer node does not have a cache entry to return a Neighbor + // Advertisements message." 
- RFC 4861 section 4.4
+ wantConfirmationCount: 0,
+ },
+ {
+ name: "Multicast Neighbor Advertisement with target link-layer address option",
+ createPacket: func() header.ICMPv6 {
+ naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+ icmp := header.ICMPv6(buffer.NewView(naSize))
+ icmp.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(icmp.NDPPayload())
+ na.SetSolicitedFlag(false)
+ na.SetOverrideFlag(false)
+ na.SetTargetAddress(lladdr1)
+ na.Options().Serialize(header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(linkAddr1),
+ })
+ return icmp
+ },
+ source: lladdr1,
+ destination: header.IPv6AllNodesMulticastAddress,
+ wantConfirmationCount: 1,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+ TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+ UseNeighborCache: true,
+ })
+ {
+ if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+ t.Fatalf("CreateNIC(_, _) = %s", err)
+ }
+ if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+ }
+ }
+ {
+ subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+ if err != nil {
+ t.Fatal(err)
+ }
+ s.SetRouteTable(
+ []tcpip.Route{{
+ Destination: subnet,
+ NIC: nicID,
+ }},
+ )
+ }
+
+ netProto := s.NetworkProtocolInstance(ProtocolNumber)
+ if netProto == nil {
+ t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+ }
+ nudHandler := &stubNUDHandler{}
+ ep := netProto.NewEndpoint(&testInterface{linkAddr: linkAddr0}, &stubLinkAddressCache{}, nudHandler, &stubDispatcher{})
+ defer ep.Close()
+
+ if err := ep.Enable(); err != nil {
+ t.Fatalf("ep.Enable(): %s", err)
+ }
+
+ r, err := s.FindRoute(nicID, lladdr0, test.source, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, test.source, err)
+ }
+ defer r.Release()
+
+ // TODO(gvisor.dev/issue/4517): Remove the need for this manual patch.
+ r.LocalAddress = test.destination
+
+ icmp := test.createPacket()
+ icmp.SetChecksum(header.ICMPv6Checksum(icmp, r.RemoteAddress, r.LocalAddress, buffer.VectorisedView{}))
+ pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+ ReserveHeaderBytes: header.IPv6MinimumSize,
+ Data: buffer.View(icmp).ToVectorisedView(),
+ })
+ ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(len(icmp)),
+ NextHeader: uint8(header.ICMPv6ProtocolNumber),
+ HopLimit: header.NDPHopLimit,
+ SrcAddr: r.RemoteAddress,
+ DstAddr: r.LocalAddress,
+ })
+ ep.HandlePacket(&r, pkt)
+
+ // Confirm the endpoint calls the correct NUDHandler method.
+ if nudHandler.probeCount != test.wantProbeCount {
+ t.Errorf("got nudHandler.probeCount = %d, want = %d", nudHandler.probeCount, test.wantProbeCount)
+ }
+ if nudHandler.confirmationCount != test.wantConfirmationCount {
+ t.Errorf("got nudHandler.confirmationCount = %d, want = %d", nudHandler.confirmationCount, test.wantConfirmationCount)
+ }
+ })
}
}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 0eafe9790..9670696c7 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,26 +12,37 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package ipv6 contains the implementation of the ipv6 network protocol. To use
-// it in the networking stack, this package must be added to the project, and
-// activated on the stack by passing ipv6.NewProtocol() as one of the network
-// protocols when calling stack.New(). Then endpoints can be created by passing
-// ipv6.ProtocolNumber as the network protocol number when calling
-// Stack.NewEndpoint().
+// Package ipv6 contains the implementation of the ipv6 network protocol.
package ipv6
import (
+ "encoding/binary"
"fmt"
+ "hash/fnv"
+ "sort"
"sync/atomic"
+ "time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
+ "gvisor.dev/gvisor/pkg/tcpip/network/hash"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
const (
+ // As per RFC 8200 section 4.5:
+ // If insufficient fragments are received to complete reassembly of a packet
+ // within 60 seconds of the reception of the first-arriving fragment of that
+ // packet, reassembly of that packet must be abandoned.
+ //
+ // Linux also uses 60 seconds for reassembly timeout:
+ // https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ipv6.h#L456
+ reassembleTimeout = 60 * time.Second
+
// ProtocolNumber is the ipv6 protocol number.
ProtocolNumber = header.IPv6ProtocolNumber
@@ -42,15 +53,306 @@ const (
// DefaultTTL is the default hop limit for IPv6 Packets egressed by
// Netstack.
DefaultTTL = 64
+
+ // buckets for fragment identifiers
+ buckets = 2048
)
+var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
+var _ stack.AddressableEndpoint = (*endpoint)(nil)
+var _ stack.NetworkEndpoint = (*endpoint)(nil)
+var _ stack.NDPEndpoint = (*endpoint)(nil)
+var _ NDPEndpoint = (*endpoint)(nil)
+
type endpoint struct {
- nicID tcpip.NICID
- linkEP stack.LinkEndpoint
+ nic stack.NetworkInterface
linkAddrCache stack.LinkAddressCache
+ nud stack.NUDHandler
dispatcher stack.TransportDispatcher
protocol *protocol
stack *stack.Stack
+
+ // enabled is set to 1 when the endpoint is enabled and 0 when it is
+ // disabled.
+ //
+ // Must be accessed using atomic operations.
+ enabled uint32
+
+ mu struct {
+ sync.RWMutex
+
+ addressableEndpointState stack.AddressableEndpointState
+ ndp ndpState
+ }
+}
+
+// NICNameFromID is a function that returns a stable name for the specified NIC,
+// even if different NIC IDs are used to refer to the same NIC in different
+// program runs. It is used when generating opaque interface identifiers (IIDs).
+// If the NIC was created with a name, it is passed to NICNameFromID.
+//
+// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
+// generated for the same prefix on different NICs.
+type NICNameFromID func(tcpip.NICID, string) string
+
+// OpaqueInterfaceIdentifierOptions holds the options related to the generation
+// of opaque interface identifiers (IIDs) as defined by RFC 7217.
+type OpaqueInterfaceIdentifierOptions struct {
+ // NICNameFromID is a function that returns a stable name for a specified NIC,
+ // even if the NIC ID changes over time.
+ //
+ // Must be specified to generate the opaque IID.
+ NICNameFromID NICNameFromID
+
+ // SecretKey is a pseudo-random number used as the secret key when generating
+ // opaque IIDs as defined by RFC 7217. The key SHOULD be at least
+ // header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
+ // requirements for security as outlined by RFC 4086. SecretKey MUST NOT
+ // change between program runs, unless explicitly changed.
+ //
+ // OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
+ // MUST NOT be modified after Stack is created.
+ //
+ // May be nil, but a nil value is highly discouraged to maintain
+ // some level of randomness between nodes.
+ SecretKey []byte
+}
+
+// InvalidateDefaultRouter implements stack.NDPEndpoint.
+func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ e.mu.ndp.invalidateDefaultRouter(rtr)
+}
+
+// SetNDPConfigurations implements NDPEndpoint.
+func (e *endpoint) SetNDPConfigurations(c NDPConfigurations) {
+ c.validate()
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ e.mu.ndp.configs = c
+}
+
+// hasTentativeAddr returns true if addr is tentative on e.
+func (e *endpoint) hasTentativeAddr(addr tcpip.Address) bool {
+ e.mu.RLock()
+ addressEndpoint := e.getAddressRLocked(addr)
+ e.mu.RUnlock()
+ return addressEndpoint != nil && addressEndpoint.GetKind() == stack.PermanentTentative
+}
+
+// dupTentativeAddrDetected attempts to inform e that a tentative addr is a
+// duplicate on a link.
+//
+// dupTentativeAddrDetected removes the tentative address if it exists. If the
+// address was generated via SLAAC, an attempt is made to generate a new
+// address.
+func (e *endpoint) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ addressEndpoint := e.getAddressRLocked(addr)
+ if addressEndpoint == nil {
+ return tcpip.ErrBadAddress
+ }
+
+ if addressEndpoint.GetKind() != stack.PermanentTentative {
+ return tcpip.ErrInvalidEndpointState
+ }
+
+ // If the address is a SLAAC address, do not invalidate its SLAAC prefix as an
+ // attempt will be made to generate a new address for it.
+ if err := e.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */); err != nil {
+ return err
+ }
+
+ prefix := addressEndpoint.AddressWithPrefix().Subnet()
+
+ switch t := addressEndpoint.ConfigType(); t {
+ case stack.AddressConfigStatic:
+ case stack.AddressConfigSlaac:
+ e.mu.ndp.regenerateSLAACAddr(prefix)
+ case stack.AddressConfigSlaacTemp:
+ // Do not reset the generation attempts counter for the prefix as the
+ // temporary address is being regenerated in response to a DAD conflict.
+ e.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */)
+ default:
+ panic(fmt.Sprintf("unrecognized address config type = %d", t))
+ }
+
+ return nil
+}
+
+// transitionForwarding transitions the endpoint between acting as a host and
+// acting as a router, to match the stack's new forwarding status.
+//
+// Must only be called when the forwarding status changes.
+func (e *endpoint) transitionForwarding(forwarding bool) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ if !e.Enabled() {
+ return
+ }
+
+ if forwarding {
+ // When transitioning into an IPv6 router, host-only state (NDP discovered
+ // routers, discovered on-link prefixes, and auto-generated addresses) is
+ // cleaned up/invalidated and NDP router solicitations are stopped.
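For reference, the opaque IIDs that OpaqueInterfaceIdentifierOptions above feeds into are derived per RFC 7217 from a pseudo-random function over the prefix, a stable interface name, a DAD retry counter, and the secret key. An illustrative sketch of such a derivation, not gvisor's implementation:

package main

import (
	"crypto/sha256"
	"fmt"
)

// opaqueIID derives an RFC 7217-style interface identifier. The same inputs
// always yield the same IID, but the IID reveals nothing about the
// link-layer address.
func opaqueIID(prefix [8]byte, nicName string, dadCounter uint8, secret []byte) [8]byte {
	h := sha256.New()
	h.Write(prefix[:])
	h.Write([]byte(nicName))
	h.Write([]byte{dadCounter})
	h.Write(secret)
	var iid [8]byte
	copy(iid[:], h.Sum(nil)) // low 64 bits of the digest become the IID
	return iid
}

func main() {
	prefix := [8]byte{0xfe, 0x80} // fe80::/64
	fmt.Printf("%x\n", opaqueIID(prefix, "eth0", 0, []byte("secret")))
}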
+// transitionForwarding transitions the endpoint's forwarding status to the
+// passed value.
+//
+// Must only be called when the forwarding status changes.
+func (e *endpoint) transitionForwarding(forwarding bool) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if !e.Enabled() {
+		return
+	}
+
+	if forwarding {
+		// When transitioning into an IPv6 router, host-only state (NDP discovered
+		// routers, discovered on-link prefixes, and auto-generated addresses) is
+		// cleaned up/invalidated and NDP router solicitations are stopped.
+		e.mu.ndp.stopSolicitingRouters()
+		e.mu.ndp.cleanupState(true /* hostOnly */)
+	} else {
+		// When transitioning into an IPv6 host, NDP router solicitations are
+		// started.
+		e.mu.ndp.startSolicitingRouters()
+	}
+}
+
+// Enable implements stack.NetworkEndpoint.
+func (e *endpoint) Enable() *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// If the NIC is not enabled, the endpoint can't do anything meaningful so
+	// don't enable the endpoint.
+	if !e.nic.Enabled() {
+		return tcpip.ErrNotPermitted
+	}
+
+	// If the endpoint is already enabled, there is nothing for it to do.
+	if !e.setEnabled(true) {
+		return nil
+	}
+
+	// Join the IPv6 All-Nodes Multicast group if the stack is configured to
+	// use IPv6. This is required to ensure that this node properly receives
+	// and responds to the various NDP messages that are destined to the
+	// all-nodes multicast address. An example is the Neighbor Advertisement
+	// when we perform Duplicate Address Detection, or Router Advertisement
+	// when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861
+	// section 4.2 for more information.
+	//
+	// Also auto-generate an IPv6 link-local address based on the endpoint's
+	// link address if it is configured to do so. Note, each interface is
+	// required to have an IPv6 link-local unicast address, as per RFC 4291
+	// section 2.1.
+
+	// Join the All-Nodes multicast group before starting DAD as responses to
+	// DAD (NDP NS) messages may be sent to the All-Nodes multicast group if the
+	// source address of the NDP NS is the unspecified address, as per RFC 4861
+	// section 7.2.4.
+	if _, err := e.mu.addressableEndpointState.JoinGroup(header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
+	// Perform DAD on all the unicast IPv6 endpoints that are in the permanent
+	// state.
+	//
+	// Addresses may have already completed DAD but in the time since the
+	// endpoint was last enabled, other devices may have acquired the same
+	// addresses.
+	var err *tcpip.Error
+	e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool {
+		addr := addressEndpoint.AddressWithPrefix().Address
+		if !header.IsV6UnicastAddress(addr) {
+			return true
+		}
+
+		switch addressEndpoint.GetKind() {
+		case stack.Permanent:
+			addressEndpoint.SetKind(stack.PermanentTentative)
+			fallthrough
+		case stack.PermanentTentative:
+			err = e.mu.ndp.startDuplicateAddressDetection(addr, addressEndpoint)
+			return err == nil
+		default:
+			return true
+		}
+	})
+	if err != nil {
+		return err
+	}
+
+	// Do not auto-generate an IPv6 link-local address for loopback devices.
+	if e.protocol.autoGenIPv6LinkLocal && !e.nic.IsLoopback() {
+		// The valid and preferred lifetime is infinite for the auto-generated
+		// link-local address.
+		e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
+	}
+
+	// If we are operating as a router, then do not solicit routers since we
+	// won't process the RAs anyway.
+	//
+	// Routers do not process Router Advertisements (RA) the same way a host
+	// does. That is, routers do not learn from RAs (e.g. on-link prefixes
+	// and default routers). Therefore, soliciting RAs from other routers on
+	// a link is unnecessary for routers.
+	if !e.protocol.Forwarding() {
+		e.mu.ndp.startSolicitingRouters()
+	}
+
+	return nil
+}
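The Enable/Disable pair below relies on an atomic swap so each transition is observed exactly once, which is what makes both calls idempotent. A minimal standalone sketch of the same idiom (names are illustrative):

import "sync/atomic"

// enabledFlag sketches the idiom used by endpoint.setEnabled: Swap returns
// the previous value, so the caller learns whether it performed the
// transition and can skip duplicate enable/disable work.
type enabledFlag struct{ v uint32 }

func (f *enabledFlag) set(enabled bool) (changed bool) {
	if enabled {
		return atomic.SwapUint32(&f.v, 1) == 0
	}
	return atomic.SwapUint32(&f.v, 0) == 1
}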
+// Enabled implements stack.NetworkEndpoint.
+func (e *endpoint) Enabled() bool {
+	return e.nic.Enabled() && e.isEnabled()
+}
+
+// isEnabled returns true if the endpoint is enabled, regardless of the
+// enabled status of the NIC.
+func (e *endpoint) isEnabled() bool {
+	return atomic.LoadUint32(&e.enabled) == 1
+}
+
+// setEnabled sets the enabled status for the endpoint.
+//
+// Returns true if the enabled status was updated.
+func (e *endpoint) setEnabled(v bool) bool {
+	if v {
+		return atomic.SwapUint32(&e.enabled, 1) == 0
+	}
+	return atomic.SwapUint32(&e.enabled, 0) == 1
+}
+
+// Disable implements stack.NetworkEndpoint.
+func (e *endpoint) Disable() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.disableLocked()
+}
+
+func (e *endpoint) disableLocked() {
+	if !e.setEnabled(false) {
+		return
+	}
+
+	e.mu.ndp.stopSolicitingRouters()
+	e.mu.ndp.cleanupState(false /* hostOnly */)
+	e.stopDADForPermanentAddressesLocked()
+
+	// The endpoint may have already left the multicast group.
+	if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
+		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err))
+	}
+}
+
+// stopDADForPermanentAddressesLocked stops DAD for all permanent addresses.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) stopDADForPermanentAddressesLocked() {
+	// Stop DAD for all the tentative unicast addresses.
+	e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool {
+		if addressEndpoint.GetKind() != stack.PermanentTentative {
+			return true
+		}
+
+		addr := addressEndpoint.AddressWithPrefix().Address
+		if header.IsV6UnicastAddress(addr) {
+			e.mu.ndp.stopDuplicateAddressDetection(addr)
+		}
+
+		return true
+	})
+}

 // DefaultTTL is the default hop limit for this endpoint.
@@ -61,31 +363,13 @@ func (e *endpoint) DefaultTTL() uint8 {
 // MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
 // the network layer max header length.
 func (e *endpoint) MTU() uint32 {
-	return calculateMTU(e.linkEP.MTU())
-}
-
-// NICID returns the ID of the NIC this endpoint belongs to.
-func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicID
-}
-
-// Capabilities implements stack.NetworkEndpoint.Capabilities.
-func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return e.linkEP.Capabilities()
+	return calculateMTU(e.nic.MTU())
 }

 // MaxHeaderLength returns the maximum length needed by ipv6 headers (and
 // underlying protocols).
 func (e *endpoint) MaxHeaderLength() uint16 {
-	return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize
-}
-
-// GSOMaxSize returns the maximum GSO packet size.
-func (e *endpoint) GSOMaxSize() uint32 {
-	if gso, ok := e.linkEP.(stack.GSOEndpoint); ok {
-		return gso.GSOMaxSize()
-	}
-	return 0
+	return e.nic.MaxHeaderLength() + header.IPv6MinimumSize
 }

 func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
@@ -99,12 +383,76 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
 		SrcAddr:       r.LocalAddress,
 		DstAddr:       r.RemoteAddress,
 	})
-	pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
+	pkt.NetworkProtocolNumber = ProtocolNumber
+}
+
+func (e *endpoint) packetMustBeFragmented(pkt *stack.PacketBuffer, gso *stack.GSO) bool {
+	return (gso == nil || gso.Type == stack.GSONone) && pkt.Size() > int(e.nic.MTU())
+}
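To make the sizing concrete before handleFragments below: each fragment carries the IPv6 fixed header plus an 8-byte fragment extension header, and the fragmentable payload must be a multiple of 8 bytes because fragment offsets are expressed in 8-byte units. A hedged sketch of that arithmetic, assuming a bare 40-byte IPv6 header with no other extension headers:

// fragmentPayloadPerPacket mirrors the fragmenter's arithmetic for the
// simple no-extension-header case: for a 1500-byte link MTU it returns
// 1448 (1500 - 40 - 8, rounded down to a multiple of 8).
func fragmentPayloadPerPacket(linkMTU uint32) uint32 {
	const ipv6HeaderSize = 40    // header.IPv6MinimumSize
	const fragmentHeaderSize = 8 // header.IPv6FragmentHeaderSize
	return (linkMTU - ipv6HeaderSize - fragmentHeaderSize) &^ 7
}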
+// handleFragments fragments pkt and calls the handler function on each
+// fragment. It returns the number of fragments handled and the number of
+// fragments left to be processed. The IP header must already be present in
+// the original packet. mtu is the maximum size allowed for each generated
+// fragment, including its IPv6 headers. The transport header protocol number
+// is required to avoid parsing the IPv6 extension headers.
+func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, mtu uint32, pkt *stack.PacketBuffer, transProto tcpip.TransportProtocolNumber, handler func(*stack.PacketBuffer) *tcpip.Error) (int, int, *tcpip.Error) {
+	fragMTU := int(calculateFragmentInnerMTU(mtu, pkt))
+	if fragMTU < pkt.TransportHeader().View().Size() {
+		// As per RFC 8200 Section 4.5, the Transport Header is expected to be
+		// small enough to fit in the first fragment.
+		return 0, 1, tcpip.ErrMessageTooLong
+	}
+
+	pf := fragmentation.MakePacketFragmenter(pkt, fragMTU, calculateFragmentReserve(pkt))
+	id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, e.protocol.hashIV)%buckets], 1)
+	networkHeader := header.IPv6(pkt.NetworkHeader().View())
+
+	var n int
+	for {
+		fragPkt, more := buildNextFragment(&pf, networkHeader, transProto, id)
+		if err := handler(fragPkt); err != nil {
+			return n, pf.RemainingFragmentCount() + 1, err
+		}
+		n++
+		if !more {
+			return n, pf.RemainingFragmentCount(), nil
+		}
+	}
+}
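The identifier assignment above avoids a single global counter: the route is hashed into one of `buckets` per-flow counters, which are bumped atomically. A reduced sketch of the pattern (the flow hash input is a stand-in for hashRoute):

import "sync/atomic"

// nextFragmentID sketches handleFragments' ID scheme: pick the counter
// bucket for this flow and atomically increment it, so concurrent senders
// on different flows rarely contend on the same counter while IDs stay
// unique per source-destination pair.
func nextFragmentID(ids []uint32, flowHash uint32) uint32 {
	return atomic.AddUint32(&ids[flowHash%uint32(len(ids))], 1)
}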
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	e.addIPHeader(r, pkt, params)
+	return e.writePacket(r, gso, pkt, params.Protocol)
+}
+
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber) *tcpip.Error {
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
+	ipt := e.protocol.stack.IPTables()
+	if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
+		// iptables is telling us to drop the packet.
+		r.Stats().IP.IPTablesOutputDropped.Increment()
+		return nil
+	}
+
+	// If the packet is manipulated as per NAT Output rules, handle the packet
+	// based on the destination address and do not send the packet to the link
+	// layer.
+	//
+	// TODO(gvisor.dev/issue/170): We should do this for every
+	// packet, rather than only NATted packets, but removing this check
+	// short circuits broadcasts before they are sent out to other hosts.
+	if pkt.NatDone {
+		netHeader := header.IPv6(pkt.NetworkHeader().View())
+		if ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+			route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
+			ep.HandlePacket(&route, pkt)
+			return nil
+		}
+	}

 	if r.Loop&stack.PacketLoop != 0 {
 		loopedR := r.MakeLoopedRoute()
@@ -120,11 +468,29 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		return nil
 	}

+	if e.packetMustBeFragmented(pkt, gso) {
+		sent, remain, err := e.handleFragments(r, gso, e.nic.MTU(), pkt, protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
+			// fragment one by one using WritePacket() (current strategy) or if we
+			// want to create a PacketBufferList from the fragments and feed it to
+			// WritePackets(). It'll be faster but cost more memory.
+			return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
+		})
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(remain))
+		return err
+	}
+
+	if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return err
+	}
+
 	r.Stats().IP.PacketsSent.Increment()
-	return e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt)
+	return nil
 }

-// WritePackets implements stack.LinkEndpoint.WritePackets.
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
@@ -135,29 +501,131 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe

 	for pb := pkts.Front(); pb != nil; pb = pb.Next() {
 		e.addIPHeader(r, pb, params)
+		if e.packetMustBeFragmented(pb, gso) {
+			// Keep track of the packet that is about to be fragmented so it can be
+			// removed once the fragmentation is done.
+			originalPkt := pb
+			if _, _, err := e.handleFragments(r, gso, e.nic.MTU(), pb, params.Protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+				// Modify the packet list in place with the new fragments.
+				pkts.InsertAfter(pb, fragPkt)
+				pb = fragPkt
+				return nil
+			}); err != nil {
+				r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
+				return 0, err
+			}
+			// Remove the packet that was just fragmented and process the rest.
+			pkts.Remove(originalPkt)
+		}
+	}
+
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
+	ipt := e.protocol.stack.IPTables()
+	dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
+	if len(dropped) == 0 && len(natPkts) == 0 {
+		// Fast path: If no packets are to be dropped then we can just invoke the
+		// faster WritePackets API directly.
+		n, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber)
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+		if err != nil {
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n))
+		}
+		return n, err
+	}
+	r.Stats().IP.IPTablesOutputDropped.IncrementBy(uint64(len(dropped)))
+
+	// Slow path: since we are dropping some packets in the batch, degrade to
+	// emitting one packet at a time.
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if _, ok := dropped[pkt]; ok {
+			continue
+		}
+		if _, ok := natPkts[pkt]; ok {
+			netHeader := header.IPv6(pkt.NetworkHeader().View())
+			if ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+				src := netHeader.SourceAddress()
+				dst := netHeader.DestinationAddress()
+				route := r.ReverseRoute(src, dst)
+				ep.HandlePacket(&route, pkt)
+				n++
+				continue
+			}
+		}
+		if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+			r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n + len(dropped)))
+			// Dropped packets aren't errors, so include them in
+			// the return value.
+			return n + len(dropped), err
+		}
+		n++
 	}

-	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
-	return n, err
+	// Dropped packets aren't errors, so include them in the return value.
+	return n + len(dropped), nil
 }
-// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
-// supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
-	// TODO(b/146666412): Support IPv6 header-included packets.
-	return tcpip.ErrNotSupported
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+	// The packet already has an IP header, but there are a few required checks.
+	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return tcpip.ErrMalformedHeader
+	}
+	ip := header.IPv6(h)
+
+	// Always set the payload length.
+	pktSize := pkt.Data.Size()
+	ip.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize))
+
+	// Set the source address when zero.
+	if ip.SourceAddress() == header.IPv6Any {
+		ip.SetSourceAddress(r.LocalAddress)
+	}
+
+	// Set the destination. If the packet already included a destination, it
+	// will be part of the route anyway.
+	ip.SetDestinationAddress(r.RemoteAddress)
+
+	// Populate the packet buffer's network header and don't allow an invalid
+	// packet to be sent.
+	//
+	// Note that parsing only makes sure that the packet is well formed as per
+	// the wire format. We also want to check if the header's fields are valid
+	// before sending the packet.
+	proto, _, _, _, ok := parse.IPv6(pkt)
+	if !ok || !header.IPv6(pkt.NetworkHeader().View()).IsValid(pktSize) {
+		return tcpip.ErrMalformedHeader
+	}
+
+	return e.writePacket(r, nil /* gso */, pkt, proto)
 }

 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	if !e.isEnabled() {
+		return
+	}
+
 	h := header.IPv6(pkt.NetworkHeader().View())
 	if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}

+	// As per RFC 4291 section 2.7:
+	//   Multicast addresses must not be used as source addresses in IPv6
+	//   packets or appear in any Routing header.
+	if header.IsV6MulticastAddress(r.RemoteAddress) {
+		r.Stats().IP.InvalidSourceAddressesReceived.Increment()
+		return
+	}
+
 	// vv consists of:
 	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
 	// - The transport header, if present.
@@ -168,7 +636,19 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv)
 	hasFragmentHeader := false

-	for firstHeader := true; ; firstHeader = false {
+	// iptables filtering. All packets that reach here are intended for
+	// this machine and need not be forwarded.
+	ipt := e.protocol.stack.IPTables()
+	if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
+		// iptables is telling us to drop the packet.
+		r.Stats().IP.IPTablesInputDropped.Increment()
+		return
+	}
+
+	for {
+		// Keep track of the start of the previous header so we can report the
+		// special case of a Hop by Hop at a location other than at the start.
+		previousHeaderStart := it.HeaderOffset()
 		extHdr, done, err := it.Next()
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -182,11 +662,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 		case header.IPv6HopByHopOptionsExtHdr:
 			// As per RFC 8200 section 4.1, the Hop By Hop extension header is
 			// restricted to appear immediately after an IPv6 fixed header.
- // - // TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 - // (unrecognized next header) error in response to an extension header's - // Next Header field with the Hop By Hop extension header identifier. - if !firstHeader { + if previousHeaderStart != 0 { + _ = e.protocol.returnError(r, &icmpReasonParameterProblem{ + code: header.ICMPv6UnknownHeader, + pointer: previousHeaderStart, + }, pkt) return } @@ -208,13 +688,25 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return - case header.IPv6OptionUnknownActionDiscardSendICMP: - // TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for - // unrecognized IPv6 extension header options. - return case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: - // TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for - // unrecognized IPv6 extension header options. + if header.IsV6MulticastAddress(r.LocalAddress) { + return + } + fallthrough + case header.IPv6OptionUnknownActionDiscardSendICMP: + // This case satisfies a requirement of RFC 8200 section 4.2 + // which states that an unknown option starting with bits [10] should: + // + // discard the packet and, regardless of whether or not the + // packet's Destination Address was a multicast address, send an + // ICMP Parameter Problem, Code 2, message to the packet's + // Source Address, pointing to the unrecognized Option Type. + // + _ = e.protocol.returnError(r, &icmpReasonParameterProblem{ + code: header.ICMPv6UnknownOption, + pointer: it.ParseOffset() + optsIt.OptionOffset(), + respondToMulticast: true, + }, pkt) return default: panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %d", opt)) @@ -225,16 +717,20 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { // As per RFC 8200 section 4.4, if a node encounters a routing header with // an unrecognized routing type value, with a non-zero Segments Left // value, the node must discard the packet and send an ICMP Parameter - // Problem, Code 0. If the Segments Left is 0, the node must ignore the - // Routing extension header and process the next header in the packet. + // Problem, Code 0 to the packet's Source Address, pointing to the + // unrecognized Routing Type. + // + // If the Segments Left is 0, the node must ignore the Routing extension + // header and process the next header in the packet. // // Note, the stack does not yet handle any type of routing extension // header, so we just make sure Segments Left is zero before processing // the next extension header. - // - // TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 0 for - // unrecognized routing types with a non-zero Segments Left value. if extHdr.SegmentsLeft() != 0 { + _ = e.protocol.returnError(r, &icmpReasonParameterProblem{ + code: header.ICMPv6ErroneousHeader, + pointer: it.ParseOffset(), + }, pkt) return } @@ -267,7 +763,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { it, done, err := it.Next() if err != nil { r.Stats().IP.MalformedPacketsReceived.Increment() - r.Stats().IP.MalformedPacketsReceived.Increment() + r.Stats().IP.MalformedFragmentsReceived.Increment() return } if done { @@ -310,21 +806,18 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { // The packet is a fragment, let's try to reassemble it. 
start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit - last := start + uint16(fragmentPayloadLen) - 1 - // Drop the packet if the fragmentOffset is incorrect. i.e the - // combination of fragmentOffset and pkt.Data.size() causes a - // wrap around resulting in last being less than the offset. - if last < start { + // Drop the fragment if the size of the reassembled payload would exceed + // the maximum payload size. + if int(start)+fragmentPayloadLen > header.IPv6MaximumPayloadSize { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() return } - var ready bool // Note that pkt doesn't have its transport header set after reassembly, // and won't until DeliverNetworkPacket sets it. - pkt.Data, ready, err = e.protocol.fragmentation.Process( + data, proto, ready, err := e.protocol.fragmentation.Process( // IPv6 ignores the Protocol field since the ID only needs to be unique // across source-destination pairs, as per RFC 8200 section 4.5. fragmentation.FragmentID{ @@ -333,8 +826,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { ID: extHdr.ID(), }, start, - last, + start+uint16(fragmentPayloadLen)-1, extHdr.More(), + uint8(rawPayload.Identifier), rawPayload.Buf, ) if err != nil { @@ -342,12 +836,14 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { r.Stats().IP.MalformedFragmentsReceived.Increment() return } + pkt.Data = data if ready { // We create a new iterator with the reassembled packet because we could // have more extension headers in the reassembled payload, as per RFC - // 8200 section 4.5. - it = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data) + // 8200 section 4.5. We also use the NextHeader value from the first + // fragment. + it = header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(proto), pkt.Data) } case header.IPv6DestinationOptionsExtHdr: @@ -369,13 +865,25 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return - case header.IPv6OptionUnknownActionDiscardSendICMP: - // TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for - // unrecognized IPv6 extension header options. - return case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: - // TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for - // unrecognized IPv6 extension header options. + if header.IsV6MulticastAddress(r.LocalAddress) { + return + } + fallthrough + case header.IPv6OptionUnknownActionDiscardSendICMP: + // This case satisfies a requirement of RFC 8200 section 4.2 + // which states that an unknown option starting with bits [10] should: + // + // discard the packet and, regardless of whether or not the + // packet's Destination Address was a multicast address, send an + // ICMP Parameter Problem, Code 2, message to the packet's + // Source Address, pointing to the unrecognized Option Type. 
+ // + _ = e.protocol.returnError(r, &icmpReasonParameterProblem{ + code: header.ICMPv6UnknownOption, + pointer: it.ParseOffset() + optsIt.OptionOffset(), + respondToMulticast: true, + }, pkt) return default: panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %d", opt)) @@ -394,21 +902,55 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { extHdr.Buf.TrimFront(pkt.TransportHeader().View().Size()) pkt.Data = extHdr.Buf + r.Stats().IP.PacketsDelivered.Increment() if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber { + pkt.TransportProtocolNumber = p e.handleICMP(r, pkt, hasFragmentHeader) } else { r.Stats().IP.PacketsDelivered.Increment() - // TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error - // in response to unrecognized next header values. - e.dispatcher.DeliverTransportPacket(r, p, pkt) + switch res := e.dispatcher.DeliverTransportPacket(r, p, pkt); res { + case stack.TransportPacketHandled: + case stack.TransportPacketDestinationPortUnreachable: + // As per RFC 4443 section 3.1: + // A destination node SHOULD originate a Destination Unreachable + // message with Code 4 in response to a packet for which the + // transport protocol (e.g., UDP) has no listener, if that transport + // protocol has no alternative means to inform the sender. + _ = e.protocol.returnError(r, &icmpReasonPortUnreachable{}, pkt) + case stack.TransportPacketProtocolUnreachable: + // As per RFC 8200 section 4. (page 7): + // Extension headers are numbered from IANA IP Protocol Numbers + // [IANA-PN], the same values used for IPv4 and IPv6. When + // processing a sequence of Next Header values in a packet, the + // first one that is not an extension header [IANA-EH] indicates + // that the next item in the packet is the corresponding upper-layer + // header. + // With more related information on page 8: + // If, as a result of processing a header, the destination node is + // required to proceed to the next header but the Next Header value + // in the current header is unrecognized by the node, it should + // discard the packet and send an ICMP Parameter Problem message to + // the source of the packet, with an ICMP Code value of 1 + // ("unrecognized Next Header type encountered") and the ICMP + // Pointer field containing the offset of the unrecognized value + // within the original packet. + // + // Which when taken together indicate that an unknown protocol should + // be treated as an unrecognized next header value. + _ = e.protocol.returnError(r, &icmpReasonParameterProblem{ + code: header.ICMPv6UnknownHeader, + pointer: it.ParseOffset(), + }, pkt) + default: + panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) + } } default: - // If we receive a packet for an extension header we do not yet handle, - // drop the packet for now. - // - // TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error - // in response to unrecognized next header values. + _ = e.protocol.returnError(r, &icmpReasonParameterProblem{ + code: header.ICMPv6UnknownHeader, + pointer: it.ParseOffset(), + }, pkt) r.Stats().UnknownProtocolRcvdPackets.Increment() return } @@ -416,19 +958,343 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { } // Close cleans up resources associated with the endpoint. 
-func (*endpoint) Close() {}
+func (e *endpoint) Close() {
+	e.mu.Lock()
+	e.disableLocked()
+	e.mu.ndp.removeSLAACAddresses(false /* keepLinkLocal */)
+	e.stopDADForPermanentAddressesLocked()
+	e.mu.addressableEndpointState.Cleanup()
+	e.mu.Unlock()
+
+	e.protocol.forgetEndpoint(e)
+}

 // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
 func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 	return e.protocol.Number()
 }

+// AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) {
+	// TODO(b/169350103): add checks here after making sure we no longer receive
+	// an empty address.
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.addAndAcquirePermanentAddressLocked(addr, peb, configType, deprecated)
+}
+
+// addAndAcquirePermanentAddressLocked is like AddAndAcquirePermanentAddress
+// but with locking requirements.
+//
+// addAndAcquirePermanentAddressLocked also joins the passed address's
+// solicited-node multicast group and starts duplicate address detection.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) {
+	addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated)
+	if err != nil {
+		return nil, err
+	}
+
+	if !header.IsV6UnicastAddress(addr.Address) {
+		return addressEndpoint, nil
+	}
+
+	snmc := header.SolicitedNodeAddr(addr.Address)
+	if _, err := e.mu.addressableEndpointState.JoinGroup(snmc); err != nil {
+		return nil, err
+	}
+
+	addressEndpoint.SetKind(stack.PermanentTentative)
+
+	if e.Enabled() {
+		if err := e.mu.ndp.startDuplicateAddressDetection(addr.Address, addressEndpoint); err != nil {
+			return nil, err
+		}
+	}
+
+	return addressEndpoint, nil
+}
+
+// RemovePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	addressEndpoint := e.getAddressRLocked(addr)
+	if addressEndpoint == nil || !addressEndpoint.GetKind().IsPermanent() {
+		return tcpip.ErrBadLocalAddress
+	}
+
+	return e.removePermanentEndpointLocked(addressEndpoint, true)
+}
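Both the add and remove paths here bracket an address's lifetime with its solicited-node multicast group: per RFC 4291 section 2.7.1, that group is the fixed ff02::1:ff00:0/104 prefix plus the low 24 bits of the unicast address. header.SolicitedNodeAddr performs the real mapping; a hedged standalone sketch of the same computation:

// solicitedNodeAddrSketch keeps the fixed ff02::1:ff00:0/104 prefix and
// copies the unicast address's last 24 bits into the group address, so all
// addresses sharing those low bits share one multicast group.
func solicitedNodeAddrSketch(unicast [16]byte) [16]byte {
	return [16]byte{
		0xff, 0x02, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0x01, 0xff, unicast[13], unicast[14], unicast[15],
	}
}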
+// removePermanentEndpointLocked is like RemovePermanentAddress, but operates
+// on a stack.AddressEndpoint.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEndpoint, allowSLAACInvalidation bool) *tcpip.Error {
+	addr := addressEndpoint.AddressWithPrefix()
+	unicast := header.IsV6UnicastAddress(addr.Address)
+	if unicast {
+		e.mu.ndp.stopDuplicateAddressDetection(addr.Address)
+
+		// If we are removing an address generated via SLAAC, clean up
+		// its SLAAC resources and notify the integrator.
+		switch addressEndpoint.ConfigType() {
+		case stack.AddressConfigSlaac:
+			e.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		case stack.AddressConfigSlaacTemp:
+			e.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		}
+	}
+
+	if err := e.mu.addressableEndpointState.RemovePermanentEndpoint(addressEndpoint); err != nil {
+		return err
+	}
+
+	if !unicast {
+		return nil
+	}
+
+	snmc := header.SolicitedNodeAddr(addr.Address)
+	if _, err := e.mu.addressableEndpointState.LeaveGroup(snmc); err != nil && err != tcpip.ErrBadLocalAddress {
+		return err
+	}
+
+	return nil
+}
+
+// hasPermanentAddressRLocked returns true if the endpoint has a permanent
+// address equal to the passed address.
+//
+// Precondition: e.mu must be read or write locked.
+func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool {
+	addressEndpoint := e.getAddressRLocked(addr)
+	if addressEndpoint == nil {
+		return false
+	}
+	return addressEndpoint.GetKind().IsPermanent()
+}
+
+// getAddressRLocked returns the endpoint for the passed address.
+//
+// Precondition: e.mu must be read or write locked.
+func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint {
+	return e.mu.addressableEndpointState.ReadOnly().Lookup(localAddr)
+}
+
+// MainAddress implements stack.AddressableEndpoint.
+func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.MainAddress()
+}
+
+// AcquireAssignedAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.acquireAddressOrCreateTempLocked(localAddr, allowTemp, tempPEB)
+}
+
+// acquireAddressOrCreateTempLocked is like AcquireAssignedAddress but with
+// locking requirements.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) acquireAddressOrCreateTempLocked(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
+	return e.mu.addressableEndpointState.AcquireAssignedAddress(localAddr, allowTemp, tempPEB)
+}
+
+// AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired)
+}
+// acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress
+// but with locking requirements.
+//
+// Precondition: e.mu must be read locked.
+func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
+	// addrCandidate is a candidate for Source Address Selection, as per
+	// RFC 6724 section 5.
+	type addrCandidate struct {
+		addressEndpoint stack.AddressEndpoint
+		scope           header.IPv6AddressScope
+	}
+
+	if len(remoteAddr) == 0 {
+		return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired)
+	}
+
+	// Create a candidate set of available addresses we can potentially use as
+	// a source address.
+	var cs []addrCandidate
+	e.mu.addressableEndpointState.ReadOnly().ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) {
+		// If addressEndpoint is not valid for outgoing connections, it is not a
+		// valid candidate.
+		if !addressEndpoint.IsAssigned(allowExpired) {
+			return
+		}
+
+		addr := addressEndpoint.AddressWithPrefix().Address
+		scope, err := header.ScopeForIPv6Address(addr)
+		if err != nil {
+			// Should never happen as we got addressEndpoint from the primary IPv6
+			// endpoint list and ScopeForIPv6Address only returns an error if addr
+			// is not an IPv6 address.
+			panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err))
+		}
+
+		cs = append(cs, addrCandidate{
+			addressEndpoint: addressEndpoint,
+			scope:           scope,
+		})
+	})
+
+	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
+	if err != nil {
+		// acquireOutgoingPrimaryAddressRLocked should never be called with an
+		// invalid IPv6 address.
+		panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err))
+	}
+
+	// Sort the addresses as per RFC 6724 section 5 rules 1-3 and 7.
+	//
+	// TODO(b/146021396): Implement the remaining rules of RFC 6724 section 5.
+	sort.Slice(cs, func(i, j int) bool {
+		sa := cs[i]
+		sb := cs[j]
+
+		// Prefer same address as per RFC 6724 section 5 rule 1.
+		if sa.addressEndpoint.AddressWithPrefix().Address == remoteAddr {
+			return true
+		}
+		if sb.addressEndpoint.AddressWithPrefix().Address == remoteAddr {
+			return false
+		}
+
+		// Prefer appropriate scope as per RFC 6724 section 5 rule 2.
+		if sa.scope < sb.scope {
+			return sa.scope >= remoteScope
+		} else if sb.scope < sa.scope {
+			return sb.scope < remoteScope
+		}
+
+		// Avoid deprecated addresses as per RFC 6724 section 5 rule 3.
+		if saDep, sbDep := sa.addressEndpoint.Deprecated(), sb.addressEndpoint.Deprecated(); saDep != sbDep {
+			// If sa is not deprecated, it is preferred over sb.
+			return sbDep
+		}
+
+		// Prefer temporary addresses as per RFC 6724 section 5 rule 7.
+		if saTemp, sbTemp := sa.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp, sb.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp; saTemp != sbTemp {
+			return saTemp
+		}
+
+		// sa and sb are equal, return the endpoint that is closest to the front
+		// of the primary endpoint list.
+		return i < j
+	})
+
+	// Return the most preferred address that can have its reference count
+	// incremented.
+	for _, c := range cs {
+		if c.addressEndpoint.IncRef() {
+			return c.addressEndpoint
+		}
+	}
+
+	return nil
+}
+
+// PrimaryAddresses implements stack.AddressableEndpoint.
+func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.PrimaryAddresses()
+}
+
+// PermanentAddresses implements stack.AddressableEndpoint.
+func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.PermanentAddresses()
+}
+
+// JoinGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+	if !header.IsV6MulticastAddress(addr) {
+		return false, tcpip.ErrBadAddress
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.JoinGroup(addr)
+}
+
+// LeaveGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.LeaveGroup(addr)
+}
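Returning to the comparator in acquireOutgoingPrimaryAddressRLocked above, rule 2 (appropriate scope) is the least obvious step: a smaller-scoped candidate wins only if its scope still covers the destination, so a link-local source loses to a global source when the destination is global. An isolated sketch of just that comparison:

// ruleTwoPrefersA mirrors the rule-2 branch of the sort: report whether
// candidate A should sort before candidate B, considering scopes alone.
func ruleTwoPrefersA(scopeA, scopeB, remoteScope int) bool {
	if scopeA < scopeB {
		return scopeA >= remoteScope // A is narrower but still sufficient.
	}
	if scopeB < scopeA {
		return scopeB < remoteScope // B is too narrow to reach the destination.
	}
	return false // Equal scopes: defer to later rules.
}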
+// IsInGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.IsInGroup(addr)
+}
+
+var _ stack.ForwardingNetworkProtocol = (*protocol)(nil)
+var _ stack.NetworkProtocol = (*protocol)(nil)
+
 type protocol struct {
+	stack *stack.Stack
+
+	mu struct {
+		sync.RWMutex
+
+		eps map[*endpoint]struct{}
+	}
+
+	ids    []uint32
+	hashIV uint32
+
 	// defaultTTL is the current default TTL for the protocol. Only the
-	// uint8 portion of it is meaningful and it must be accessed
-	// atomically.
-	defaultTTL uint32
+	// uint8 portion of it is meaningful.
+	//
+	// Must be accessed using atomic operations.
+	defaultTTL uint32
+
+	// forwarding is set to 1 when the protocol has forwarding enabled and 0
+	// when it is disabled.
+	//
+	// Must be accessed using atomic operations.
+	forwarding uint32
+
 	fragmentation *fragmentation.Fragmentation
+
+	// ndpDisp is the NDP event dispatcher that is used to send NDP related
+	// events to the netstack integrator.
+	ndpDisp NDPDispatcher
+
+	// ndpConfigs is the default NDP configurations used by an IPv6 endpoint.
+	ndpConfigs NDPConfigurations
+
+	// opaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
+	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// tempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	tempIIDSeed []byte
+
+	// autoGenIPv6LinkLocal determines whether or not the stack attempts to
+	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
+	autoGenIPv6LinkLocal bool
 }

 // Number returns the ipv6 protocol number.
@@ -453,22 +1319,42 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }

 // NewEndpoint creates a new ipv6 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
-	return &endpoint{
-		nicID:  nicID,
-		linkEP: linkEP,
+func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.LinkAddressCache, nud stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
+	e := &endpoint{
+		nic:           nic,
 		linkAddrCache: linkAddrCache,
+		nud:           nud,
 		dispatcher:    dispatcher,
 		protocol:      p,
-		stack:         st,
 	}
+	e.mu.addressableEndpointState.Init(e)
+	e.mu.ndp = ndpState{
+		ep:             e,
+		configs:        p.ndpConfigs,
+		dad:            make(map[tcpip.Address]dadState),
+		defaultRouters: make(map[tcpip.Address]defaultRouterState),
+		onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+		slaacPrefixes:  make(map[tcpip.Subnet]slaacPrefixState),
+	}
+	e.mu.ndp.initializeTempAddrState()
+
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.mu.eps[e] = struct{}{}
+	return e
+}
+
+func (p *protocol) forgetEndpoint(e *endpoint) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	delete(p.mu.eps, e)
+}
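NewEndpoint and forgetEndpoint above maintain a set of live endpoints precisely so that protocol-wide changes (SetForwarding below) can fan out to each endpoint under the protocol lock. A minimal sketch of the pattern with illustrative names:

import "sync"

// endpointSet sketches the protocol's eps registry: endpoints register at
// creation, deregister on Close, and protocol-wide transitions iterate the
// set while holding the lock.
type endpointSet struct {
	mu  sync.Mutex
	eps map[*endpoint]struct{}
}

func (s *endpointSet) forEach(f func(*endpoint)) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for ep := range s.eps {
		f(ep)
	}
}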
 // SetOption implements NetworkProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case tcpip.DefaultTTLOption:
-		p.SetDefaultTTL(uint8(v))
+	case *tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(*v))
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -476,7 +1362,7 @@ func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.
 }

 // Option implements NetworkProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
@@ -502,75 +1388,43 @@ func (*protocol) Close() {}

 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}

-// Parse implements stack.TransportProtocol.Parse.
+// Parse implements stack.NetworkProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
-	hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt)
 	if !ok {
 		return 0, false, false
 	}
-	ipHdr := header.IPv6(hdr)

-	// dataClone consists of:
-	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
-	// - The transport header, if present.
-	// - Any other payload data.
-	views := [8]buffer.View{}
-	dataClone := pkt.Data.Clone(views[:])
-	dataClone.TrimFront(header.IPv6MinimumSize)
-	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+	return proto, !fragMore && fragOffset == 0, true
+}

-	// Iterate over the IPv6 extensions to find their length.
-	//
-	// Parsing occurs again in HandlePacket because we don't track the
-	// extensions in PacketBuffer. Unfortunately, that means HandlePacket
-	// has to do the parsing work again.
-	var nextHdr tcpip.TransportProtocolNumber
-	foundNext := true
-	extensionsSize := 0
-traverseExtensions:
-	for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() {
-		if err != nil {
-			break
-		}
-		// If we exhaust the extension list, the entire packet is the IPv6 header
-		// and (possibly) extensions.
-		if done {
-			extensionsSize = dataClone.Size()
-			foundNext = false
-			break
-		}
+// Forwarding implements stack.ForwardingNetworkProtocol.
+func (p *protocol) Forwarding() bool {
+	return atomic.LoadUint32(&p.forwarding) == 1
+}

-		switch extHdr := extHdr.(type) {
-		case header.IPv6FragmentExtHdr:
-			// If this is an atomic fragment, we don't have to treat it specially.
-			if !extHdr.More() && extHdr.FragmentOffset() == 0 {
-				continue
-			}
-			// This is a non-atomic fragment and has to be re-assembled before we can
-			// examine the payload for a transport header.
-			foundNext = false
+// setForwarding sets the forwarding status for the protocol.
+//
+// Returns true if the forwarding status was updated.
+func (p *protocol) setForwarding(v bool) bool {
+	if v {
+		return atomic.SwapUint32(&p.forwarding, 1) == 0
+	}
+	return atomic.SwapUint32(&p.forwarding, 0) == 1
+}

-		case header.IPv6RawPayloadHeader:
-			// We've found the payload after any extensions.
-			extensionsSize = dataClone.Size() - extHdr.Buf.Size()
-			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
-			break traverseExtensions
+// SetForwarding implements stack.ForwardingNetworkProtocol.
+func (p *protocol) SetForwarding(v bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()

-		default:
-			// Any other extension is a no-op, keep looping until we find the payload.
-		}
+	if !p.setForwarding(v) {
+		return
 	}

-	// Put the IPv6 header with extensions in pkt.NetworkHeader().
-	hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize)
-	if !ok {
-		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
+	for ep := range p.mu.eps {
+		ep.transitionForwarding(v)
 	}
-	ipHdr = header.IPv6(hdr)
-	pkt.Data.CapLength(int(ipHdr.PayloadLength()))
-	pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
-
-	return nextHdr, foundNext, true
 }

 // calculateMTU calculates the network-layer payload MTU based on the link-layer
@@ -583,10 +1437,144 @@ func calculateMTU(mtu uint32) uint32 {
 	return maxPayloadSize
 }

-// NewProtocol returns an IPv6 network protocol.
-func NewProtocol() stack.NetworkProtocol {
-	return &protocol{
-		defaultTTL:    DefaultTTL,
-		fragmentation: fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
+// Options holds options to configure a new protocol.
+type Options struct {
+	// NDPConfigs is the default NDP configurations used by interfaces.
+	NDPConfigs NDPConfigurations
+
+	// AutoGenIPv6LinkLocal determines whether or not the stack attempts to
+	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs.
+	//
+	// Note, setting this to true does not mean that a link-local address is
+	// assigned right away, or at all. If Duplicate Address Detection is
+	// enabled, an address is only assigned if it successfully resolves. If it
+	// fails, no further attempts are made to auto-generate an IPv6 link-local
+	// address.
+	//
+	// The generated link-local address follows RFC 4291 Appendix A guidelines.
+	AutoGenIPv6LinkLocal bool
+
+	// NDPDisp is the NDP event dispatcher that an integrator can provide to
+	// receive NDP related events.
+	NDPDisp NDPDispatcher
+
+	// OpaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
+	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// TempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	//
+	// Temporary SLAAC addresses are short-lived addresses which are
+	// unpredictable and random from the perspective of other nodes on the
+	// network. It is recommended that the seed be a random byte buffer of at
+	// least header.IIDSize bytes to make sure that temporary SLAAC addresses
+	// are sufficiently random. It should follow minimum randomness
+	// requirements for security as outlined by RFC 4086.
+	//
+	// Note: using a nil value, the same seed across netstack program runs, or
+	// a seed that is too small would reduce randomness and increase
+	// predictability, defeating the purpose of temporary SLAAC addresses.
+	TempIIDSeed []byte
+}
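Before the constructors below, a quick usage sketch: with the new factory-based API (also visible in the test changes later in this diff), an integrator hands the factory to stack.New. The option values here are placeholders, not defaults:

s := stack.New(stack.Options{
	NetworkProtocols: []stack.NetworkProtocolFactory{
		ipv6.NewProtocolWithOptions(ipv6.Options{
			// Illustrative values only; NDPConfigs, NDPDisp, OpaqueIIDOpts,
			// and TempIIDSeed may also be set here.
			AutoGenIPv6LinkLocal: true,
		}),
	},
})
_ = s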
+// NewProtocolWithOptions returns an IPv6 network protocol.
+func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
+	opts.NDPConfigs.validate()
+
+	ids := hash.RandN32(buckets)
+	hashIV := hash.RandN32(1)[0]
+
+	return func(s *stack.Stack) stack.NetworkProtocol {
+		p := &protocol{
+			stack:         s,
+			fragmentation: fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, reassembleTimeout, s.Clock()),
+			ids:           ids,
+			hashIV:        hashIV,
+
+			ndpDisp:              opts.NDPDisp,
+			ndpConfigs:           opts.NDPConfigs,
+			opaqueIIDOpts:        opts.OpaqueIIDOpts,
+			tempIIDSeed:          opts.TempIIDSeed,
+			autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+		}
+		p.mu.eps = make(map[*endpoint]struct{})
+		p.SetDefaultTTL(DefaultTTL)
+		return p
+	}
+}
+
+// NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
+func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
+	return NewProtocolWithOptions(Options{})(s)
+}
+
+// calculateFragmentInnerMTU calculates the maximum number of bytes of
+// fragmentable data a fragment can have, based on the link layer mtu and
+// pkt's network header size.
+func calculateFragmentInnerMTU(mtu uint32, pkt *stack.PacketBuffer) uint32 {
+	// TODO(gvisor.dev/issue/3912): Once the Authentication or ESP Headers are
+	// supported for outbound packets, their length should not affect the
+	// fragment MTU because they should only be transmitted once.
+	mtu -= uint32(pkt.NetworkHeader().View().Size())
+	mtu -= header.IPv6FragmentHeaderSize
+	// Round the MTU down to align to 8 bytes.
+	mtu &^= 7
+	if mtu <= maxPayloadSize {
+		return mtu
+	}
+	return maxPayloadSize
+}
+
+func calculateFragmentReserve(pkt *stack.PacketBuffer) int {
+	return pkt.AvailableHeaderBytes() + pkt.NetworkHeader().View().Size() + header.IPv6FragmentHeaderSize
+}
+
+// hashRoute calculates a hash value for the given route. It uses the source
+// and destination addresses and a 32-bit number to generate the hash.
+func hashRoute(r *stack.Route, hashIV uint32) uint32 {
+	// FNV-1a was chosen because it is a fast hashing algorithm, and
+	// cryptographic properties are not needed here.
+	h := fnv.New32a()
+	if _, err := h.Write([]byte(r.LocalAddress)); err != nil {
+		panic(fmt.Sprintf("Hash.Write: %s, but Hash's implementation of Write is not expected to ever return an error", err))
+	}
+
+	if _, err := h.Write([]byte(r.RemoteAddress)); err != nil {
+		panic(fmt.Sprintf("Hash.Write: %s, but Hash's implementation of Write is not expected to ever return an error", err))
+	}
+
+	s := make([]byte, 4)
+	binary.LittleEndian.PutUint32(s, hashIV)
+	if _, err := h.Write(s); err != nil {
+		panic(fmt.Sprintf("Hash.Write: %s, but Hash's implementation of Write is not expected to ever return an error", err))
+	}
+
+	return h.Sum32()
+}
+
+func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeaders header.IPv6, transportProto tcpip.TransportProtocolNumber, id uint32) (*stack.PacketBuffer, bool) {
+	fragPkt, offset, copied, more := pf.BuildNextFragment()
+	fragPkt.NetworkProtocolNumber = ProtocolNumber
+
+	originalIPHeadersLength := len(originalIPHeaders)
+	fragmentIPHeadersLength := originalIPHeadersLength + header.IPv6FragmentHeaderSize
+	fragmentIPHeaders := header.IPv6(fragPkt.NetworkHeader().Push(fragmentIPHeadersLength))
+
+	// Copy the IPv6 header and any extension headers already populated.
+ if copied := copy(fragmentIPHeaders, originalIPHeaders); copied != originalIPHeadersLength { + panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got %d, want %d", copied, originalIPHeadersLength)) + } + fragmentIPHeaders.SetNextHeader(header.IPv6FragmentHeader) + fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize)) + + fragmentHeader := header.IPv6Fragment(fragmentIPHeaders[originalIPHeadersLength:]) + fragmentHeader.Encode(&header.IPv6FragmentFields{ + M: more, + FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), + Identification: id, + NextHeader: uint8(transportProto), + }) + + return fragPkt, more +} diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index 0a183bfde..297868f24 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -15,15 +15,21 @@ package ipv6 import ( + "encoding/hex" + "fmt" + "math" "testing" "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/testutil" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) @@ -43,6 +49,8 @@ const ( fragmentExtHdrID = uint8(header.IPv6FragmentExtHdrIdentifier) destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier) noNextHdrID = uint8(header.IPv6NoNextHeaderIdentifier) + + extraHeaderReserve = 50 ) // testReceiveICMP tests receiving an ICMP packet from src to dst. want is the @@ -51,8 +59,8 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst t.Helper() // Receive ICMP packet. - hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertMinimumSize) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertMinimumSize)) pkt.SetType(header.ICMPv6NeighborAdvert) pkt.SetChecksum(header.ICMPv6Checksum(pkt, src, dst, buffer.VectorisedView{})) payloadLength := hdr.UsedLength() @@ -134,23 +142,101 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst } } +func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketBuffer, mtu uint32, wantFragments []fragmentInfo, proto tcpip.TransportProtocolNumber) error { + // sourcePacket does not have its IP Header populated. Let's copy the one + // from the first fragment. + source := header.IPv6(packets[0].NetworkHeader().View()) + sourceIPHeadersLen := len(source) + vv := buffer.NewVectorisedView(sourcePacket.Size(), sourcePacket.Views()) + source = append(source, vv.ToView()...) + + var reassembledPayload buffer.VectorisedView + for i, fragment := range packets { + // Confirm that the packet is valid. 
+		allBytes := buffer.NewVectorisedView(fragment.Size(), fragment.Views())
+		fragmentIPHeaders := header.IPv6(allBytes.ToView())
+		if !fragmentIPHeaders.IsValid(len(fragmentIPHeaders)) {
+			return fmt.Errorf("fragment #%d: IP packet is invalid:\n%s", i, hex.Dump(fragmentIPHeaders))
+		}
+
+		fragmentIPHeadersLength := fragment.NetworkHeader().View().Size()
+		if fragmentIPHeadersLength != sourceIPHeadersLen {
+			return fmt.Errorf("fragment #%d: got fragmentIPHeadersLength = %d, want = %d", i, fragmentIPHeadersLength, sourceIPHeadersLen)
+		}
+
+		if got := len(fragmentIPHeaders); got > int(mtu) {
+			return fmt.Errorf("fragment #%d: got len(fragmentIPHeaders) = %d, want <= %d", i, got, mtu)
+		}
+
+		sourceIPHeader := source[:header.IPv6MinimumSize]
+		fragmentIPHeader := fragmentIPHeaders[:header.IPv6MinimumSize]
+
+		if got := fragmentIPHeaders.PayloadLength(); got != wantFragments[i].payloadSize {
+			return fmt.Errorf("fragment #%d: got fragmentIPHeaders.PayloadLength() = %d, want = %d", i, got, wantFragments[i].payloadSize)
+		}
+
+		// We expect the IPv6 Header to be similar across each fragment, besides
+		// the payload length.
+		sourceIPHeader.SetPayloadLength(0)
+		fragmentIPHeader.SetPayloadLength(0)
+		if diff := cmp.Diff(sourceIPHeader, fragmentIPHeader); diff != "" {
+			return fmt.Errorf("fragment #%d: fragmentIPHeader mismatch (-want +got):\n%s", i, diff)
+		}
+
+		if got := fragment.AvailableHeaderBytes(); got != extraHeaderReserve {
+			return fmt.Errorf("fragment #%d: got fragment.AvailableHeaderBytes() = %d, want = %d", i, got, extraHeaderReserve)
+		}
+		if fragment.NetworkProtocolNumber != sourcePacket.NetworkProtocolNumber {
+			return fmt.Errorf("fragment #%d: got fragment.NetworkProtocolNumber = %d, want = %d", i, fragment.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber)
+		}
+
+		if len(packets) > 1 {
+			// If the source packet was big enough that it needed fragmentation,
+			// let's inspect the fragment header. Because no other extension
+			// headers are supported, it will always be the last extension header.
+			fragmentHeader := header.IPv6Fragment(fragmentIPHeaders[fragmentIPHeadersLength-header.IPv6FragmentHeaderSize : fragmentIPHeadersLength])
+
+			if got := fragmentHeader.More(); got != wantFragments[i].more {
+				return fmt.Errorf("fragment #%d: got fragmentHeader.More() = %t, want = %t", i, got, wantFragments[i].more)
+			}
+			if got := fragmentHeader.FragmentOffset(); got != wantFragments[i].offset {
+				return fmt.Errorf("fragment #%d: got fragmentHeader.FragmentOffset() = %d, want = %d", i, got, wantFragments[i].offset)
+			}
+			if got := fragmentHeader.NextHeader(); got != uint8(proto) {
+				return fmt.Errorf("fragment #%d: got fragmentHeader.NextHeader() = %d, want = %d", i, got, uint8(proto))
+			}
+		}
+
+		// Store the reassembled payload as we parse each fragment. The payload
+		// includes the Transport header and everything after.
+		reassembledPayload.AppendView(fragment.TransportHeader().View())
+		reassembledPayload.Append(fragment.Data)
+	}
+
+	if diff := cmp.Diff(buffer.View(source[sourceIPHeadersLen:]), reassembledPayload.ToView()); diff != "" {
+		return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
+	}
+
+	return nil
+}
+
 // TestReceiveOnAllNodesMulticastAddr tests that IPv6 endpoints receive ICMP and
 // UDP packets destined to the IPv6 link-local all-nodes multicast address.
func TestReceiveOnAllNodesMulticastAddr(t *testing.T) { tests := []struct { name string - protocolFactory stack.TransportProtocol + protocolFactory stack.TransportProtocolFactory rxf func(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64) }{ - {"ICMP", icmp.NewProtocol6(), testReceiveICMP}, - {"UDP", udp.NewProtocol(), testReceiveUDP}, + {"ICMP", icmp.NewProtocol6, testReceiveICMP}, + {"UDP", udp.NewProtocol, testReceiveUDP}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{test.protocolFactory}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{test.protocolFactory}, }) e := channel.New(10, 1280, linkAddr1) if err := s.CreateNIC(1, e); err != nil { @@ -168,15 +254,13 @@ func TestReceiveOnAllNodesMulticastAddr(t *testing.T) { // packets destined to the IPv6 solicited-node address of an assigned IPv6 // address. func TestReceiveOnSolicitedNodeAddr(t *testing.T) { - const nicID = 1 - tests := []struct { name string - protocolFactory stack.TransportProtocol + protocolFactory stack.TransportProtocolFactory rxf func(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64) }{ - {"ICMP", icmp.NewProtocol6(), testReceiveICMP}, - {"UDP", udp.NewProtocol(), testReceiveUDP}, + {"ICMP", icmp.NewProtocol6, testReceiveICMP}, + {"UDP", udp.NewProtocol, testReceiveUDP}, } snmc := header.SolicitedNodeAddr(addr2) @@ -184,8 +268,8 @@ func TestReceiveOnSolicitedNodeAddr(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{test.protocolFactory}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{test.protocolFactory}, }) e := channel.New(1, 1280, linkAddr1) if err := s.CreateNIC(nicID, e); err != nil { @@ -193,7 +277,7 @@ func TestReceiveOnSolicitedNodeAddr(t *testing.T) { } s.SetRouteTable([]tcpip.Route{ - tcpip.Route{ + { Destination: header.IPv6EmptySubnet, NIC: nicID, }, @@ -271,7 +355,7 @@ func TestAddIpv6Address(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, }) if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil { t.Fatalf("CreateNIC(_) = %s", err) @@ -293,17 +377,22 @@ func TestAddIpv6Address(t *testing.T) { } func TestReceiveIPv6ExtHdrs(t *testing.T) { - const nicID = 1 - tests := []struct { name string extHdr func(nextHdr uint8) ([]byte, uint8) shouldAccept bool + // Should we expect an ICMP response and if so, with what contents? 
+ expectICMP bool + ICMPType header.ICMPv6Type + ICMPCode header.ICMPv6Code + pointer uint32 + multicast bool }{ { name: "None", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{}, nextHdr }, shouldAccept: true, + expectICMP: false, }, { name: "hopbyhop with unknown option skippable action", @@ -334,9 +423,10 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { }, hopByHopExtHdrID }, shouldAccept: false, + expectICMP: false, }, { - name: "hopbyhop with unknown option discard and send icmp action", + name: "hopbyhop with unknown option discard and send icmp action (unicast)", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ nextHdr, 1, @@ -346,12 +436,38 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // Discard & send ICMP if option is unknown. 191, 6, 1, 2, 3, 4, 5, 6, + //^ Unknown option. }, hopByHopExtHdrID }, shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownOption, + pointer: header.IPv6FixedHeaderSize + 8, }, { - name: "hopbyhop with unknown option discard and send icmp action unless multicast dest", + name: "hopbyhop with unknown option discard and send icmp action (multicast)", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 1, + + // Skippable unknown. + 63, 4, 1, 2, 3, 4, + + // Discard & send ICMP if option is unknown. + 191, 6, 1, 2, 3, 4, 5, 6, + //^ Unknown option. + }, hopByHopExtHdrID + }, + multicast: true, + shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownOption, + pointer: header.IPv6FixedHeaderSize + 8, + }, + { + name: "hopbyhop with unknown option discard and send icmp action unless multicast dest (unicast)", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ nextHdr, 1, @@ -362,39 +478,97 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // Discard & send ICMP unless packet is for multicast destination if // option is unknown. 255, 6, 1, 2, 3, 4, 5, 6, + //^ Unknown option. }, hopByHopExtHdrID }, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownOption, + pointer: header.IPv6FixedHeaderSize + 8, + }, + { + name: "hopbyhop with unknown option discard and send icmp action unless multicast dest (multicast)", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 1, + + // Skippable unknown. + 63, 4, 1, 2, 3, 4, + + // Discard & send ICMP unless packet is for multicast destination if + // option is unknown. + 255, 6, 1, 2, 3, 4, 5, 6, + //^ Unknown option. 
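+ // The two highest-order bits of an option type select the action for
+ // an unrecognized option (RFC 8200 section 4.2): 00 = skip over it
+ // (63 = 0b00111111), 01 = discard, 10 = discard and send an ICMP
+ // Parameter Problem (191 = 0b10111111), 11 = the same but only if the
+ // destination is not a multicast address (255 = 0b11111111).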
+ }, hopByHopExtHdrID + }, + multicast: true, shouldAccept: false, + expectICMP: false, }, { - name: "routing with zero segments left", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID }, + name: "routing with zero segments left", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 0, + 1, 0, 2, 3, 4, 5, + }, routingExtHdrID + }, shouldAccept: true, }, { - name: "routing with non-zero segments left", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 1, 2, 3, 4, 5}, routingExtHdrID }, + name: "routing with non-zero segments left", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 0, + 1, 1, 2, 3, 4, 5, + }, routingExtHdrID + }, shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6ErroneousHeader, + pointer: header.IPv6FixedHeaderSize + 2, }, { - name: "atomic fragment with zero ID", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 0, 0, 0, 0}, fragmentExtHdrID }, + name: "atomic fragment with zero ID", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 0, + 0, 0, 0, 0, 0, 0, + }, fragmentExtHdrID + }, shouldAccept: true, }, { - name: "atomic fragment with non-zero ID", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 1, 2, 3, 4}, fragmentExtHdrID }, + name: "atomic fragment with non-zero ID", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 0, + 0, 0, 1, 2, 3, 4, + }, fragmentExtHdrID + }, shouldAccept: true, + expectICMP: false, }, { - name: "fragment", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 1, 2, 3, 4}, fragmentExtHdrID }, + name: "fragment", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 0, + 1, 0, 1, 2, 3, 4, + }, fragmentExtHdrID + }, shouldAccept: false, + expectICMP: false, }, { - name: "No next header", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID }, + name: "No next header", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{}, + noNextHdrID + }, shouldAccept: false, + expectICMP: false, }, { name: "destination with unknown option skippable action", @@ -410,6 +584,7 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { }, destinationExtHdrID }, shouldAccept: true, + expectICMP: false, }, { name: "destination with unknown option discard action", @@ -425,9 +600,30 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { }, destinationExtHdrID }, shouldAccept: false, + expectICMP: false, + }, + { + name: "destination with unknown option discard and send icmp action (unicast)", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 1, + + // Skippable unknown. + 63, 4, 1, 2, 3, 4, + + // Discard & send ICMP if option is unknown. + 191, 6, 1, 2, 3, 4, 5, 6, + //^ 191 is an unknown option. + }, destinationExtHdrID + }, + shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownOption, + pointer: header.IPv6FixedHeaderSize + 8, }, { - name: "destination with unknown option discard and send icmp action", + name: "destination with unknown option discard and send icmp action (multicast)", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ nextHdr, 1, @@ -437,12 +633,18 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // Discard & send ICMP if option is unknown.
191, 6, 1, 2, 3, 4, 5, 6, + //^ 191 is an unknown option. }, destinationExtHdrID }, + multicast: true, shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownOption, + pointer: header.IPv6FixedHeaderSize + 8, }, { - name: "destination with unknown option discard and send icmp action unless multicast dest", + name: "destination with unknown option discard and send icmp action unless multicast dest (unicast)", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ nextHdr, 1, @@ -453,22 +655,33 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // Discard & send ICMP unless packet is for multicast destination if // option is unknown. 255, 6, 1, 2, 3, 4, 5, 6, + //^ 255 is unknown. }, destinationExtHdrID }, shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownOption, + pointer: header.IPv6FixedHeaderSize + 8, }, { - name: "routing - atomic fragment", + name: "destination with unknown option discard and send icmp action unless multicast dest (multicast)", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ - // Routing extension header. - fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5, + nextHdr, 1, - // Fragment extension header. - nextHdr, 0, 0, 0, 1, 2, 3, 4, - }, routingExtHdrID + // Skippable unknown. + 63, 4, 1, 2, 3, 4, + + // Discard & send ICMP unless packet is for multicast destination if + // option is unknown. + 255, 6, 1, 2, 3, 4, 5, 6, + //^ 255 is unknown. + }, destinationExtHdrID }, - shouldAccept: true, + shouldAccept: false, + expectICMP: false, + multicast: true, }, { name: "atomic fragment - routing", @@ -502,12 +715,42 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { return []byte{ // Routing extension header. hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5, + // ^^^ The HopByHop extension header may not appear after the first + // extension header. // Hop By Hop extension header with skippable unknown option. nextHdr, 0, 62, 4, 1, 2, 3, 4, }, routingExtHdrID }, shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownHeader, + pointer: header.IPv6FixedHeaderSize, + }, + { + name: "routing - hop by hop (with send icmp unknown)", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + // Routing extension header. + hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5, + // ^^^ The HopByHop extension header may not appear after the first + // extension header. + + nextHdr, 1, + + // Skippable unknown. + 63, 4, 1, 2, 3, 4, + + // Discard & send ICMP if option is unknown.
+ 191, 6, 1, 2, 3, 4, 5, 6, + }, routingExtHdrID + }, + shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownHeader, + pointer: header.IPv6FixedHeaderSize, }, { name: "No next header", @@ -551,6 +794,7 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { }, hopByHopExtHdrID }, shouldAccept: false, + expectICMP: false, }, { name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with discard unknown)", @@ -571,16 +815,17 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { }, hopByHopExtHdrID }, shouldAccept: false, + expectICMP: false, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) - e := channel.New(0, 1280, linkAddr1) + e := channel.New(1, 1280, linkAddr1) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } @@ -588,6 +833,14 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err) } + // Add a default route so that a return packet knows where to go. + s.SetRouteTable([]tcpip.Route{ + { + Destination: header.IPv6EmptySubnet, + NIC: nicID, + }, + }) + wq := waiter.Queue{} we, ch := waiter.NewChannelEntry(nil) wq.EventRegister(&we, waiter.EventIn) @@ -629,12 +882,16 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // Serialize IPv6 fixed header. payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + dstAddr := tcpip.Address(addr2) + if test.multicast { + dstAddr = header.IPv6AllNodesMulticastAddress + } ip.Encode(&header.IPv6Fields{ PayloadLength: uint16(payloadLength), NextHeader: ipv6NextHdr, HopLimit: 255, SrcAddr: addr1, - DstAddr: addr2, + DstAddr: dstAddr, }) e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ @@ -648,6 +905,44 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { t.Errorf("got UDP Rx Packets = %d, want = 0", got) } + if !test.expectICMP { + if p, ok := e.Read(); ok { + t.Fatalf("unexpected packet received: %#v", p) + } + return + } + + // ICMP required. + p, ok := e.Read() + if !ok { + t.Fatalf("expected packet wasn't written out") + } + + // Pack the output packet into a single buffer.View as the checkers + // assume that. + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + pkt := vv.ToView() + if got, want := len(pkt), header.IPv6FixedHeaderSize+header.ICMPv6MinimumSize+hdr.UsedLength(); got != want { + t.Fatalf("got an ICMP packet of size = %d, want = %d", got, want) + } + + ipHdr := header.IPv6(pkt) + checker.IPv6(t, ipHdr, checker.ICMPv6( + checker.ICMPv6Type(test.ICMPType), + checker.ICMPv6Code(test.ICMPCode))) + + // The ICMP error packet is built without extension headers, so the + // IPv6 payload is the ICMPv6 message itself. + icmpPkt := header.ICMPv6(ipHdr.Payload()) + // We know we sent small packets that won't be truncated when reflected + // back to us.
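+ // (RFC 4443 requires an ICMPv6 error message to include as much of the
+ // invoking packet as possible without the error packet exceeding the
+ // minimum IPv6 MTU of 1280 bytes.)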
+ originalPacket := icmpPkt.Payload() + if got, want := icmpPkt.TypeSpecific(), test.pointer; got != want { + t.Errorf("got ICMPv6 pointer = %d, want = %d", got, want) + } + if diff := cmp.Diff(hdr.View(), buffer.View(originalPacket)); diff != "" { + t.Errorf("ICMPv6 payload mismatch (-want +got):\n%s", diff) + } return } @@ -681,12 +976,12 @@ type fragmentData struct { func TestReceiveIPv6Fragments(t *testing.T) { const ( - nicID = 1 udpPayload1Length = 256 udpPayload2Length = 128 // Used to test cases where the fragment blocks are not a multiple of // the fragment block size of 8 (RFC 8200 section 4.5). udpPayload3Length = 127 + udpPayload4Length = header.IPv6MaximumPayloadSize - header.UDPMinimumSize fragmentExtHdrLen = 8 // Note, not all routing extension headers will be 8 bytes but this test // uses 8 byte routing extension headers for most sub tests. @@ -731,6 +1026,10 @@ func TestReceiveIPv6Fragments(t *testing.T) { udpPayload3Addr1ToAddr2 := udpPayload3Addr1ToAddr2Buf[:] ipv6Payload3Addr1ToAddr2 := udpGen(udpPayload3Addr1ToAddr2, 3, addr1, addr2) + var udpPayload4Addr1ToAddr2Buf [udpPayload4Length]byte + udpPayload4Addr1ToAddr2 := udpPayload4Addr1ToAddr2Buf[:] + ipv6Payload4Addr1ToAddr2 := udpGen(udpPayload4Addr1ToAddr2, 4, addr1, addr2) + tests := []struct { name string expectedPayload []byte @@ -866,6 +1165,46 @@ func TestReceiveIPv6Fragments(t *testing.T) { expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { + name: "Two fragments with different Next Header values", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[:64], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + // NextHeader value is different from the one in the first fragment, so + // this NextHeader should be ignored. + buffer.View([]byte{uint8(header.IPv6NoNextHeaderIdentifier), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[64:], + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { name: "Two fragments with last fragment size not a multiple of fragment block size", fragments: []fragmentData{ { @@ -980,6 +1319,44 @@ func TestReceiveIPv6Fragments(t *testing.T) { expectedPayloads: nil, }, { + name: "Two fragments reassembled into a maximum UDP packet", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+65520, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload4Addr1ToAddr2[:65520], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-65520, + []buffer.View{ + // Fragment extension header.
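+ //
+ // The 8 header bytes are: next header, reserved, a 16-bit field whose
+ // top 13 bits are the fragment offset in 8-byte units and whose lowest
+ // bit is the More flag, then a 32-bit identification. So 255, 240
+ // encodes 0xFFF0: offset 8190 units (65520 bytes) with More unset,
+ // starting exactly where the first fragment's 65520 payload bytes end.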
+ // + // Fragment offset = 8190, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 255, 240, 0, 0, 0, 1}), + + ipv6Payload4Addr1ToAddr2[65520:], + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2}, + }, + { name: "Two fragments with per-fragment routing header with zero segments left", fragments: []fragmentData{ { @@ -1464,8 +1841,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) e := channel.New(0, 1280, linkAddr1) if err := s.CreateNIC(nicID, e); err != nil { @@ -1532,3 +1909,575 @@ func TestReceiveIPv6Fragments(t *testing.T) { }) } } + +func TestInvalidIPv6Fragments(t *testing.T) { + const ( + nicID = 1 + fragmentExtHdrLen = 8 + ) + + payloadGen := func(payloadLen int) []byte { + payload := make([]byte, payloadLen) + for i := 0; i < len(payload); i++ { + payload[i] = 0x30 + } + return payload + } + + tests := []struct { + name string + fragments []fragmentData + wantMalformedIPPackets uint64 + wantMalformedFragments uint64 + }{ + { + name: "fragments reassembled into a payload exceeding the max IPv6 payload size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+(header.IPv6MaximumPayloadSize+1)-16, + []buffer.View{ + // Fragment extension header. + // Fragment offset = 8190, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, + ((header.IPv6MaximumPayloadSize + 1) - 16) >> 8, + ((header.IPv6MaximumPayloadSize + 1) - 16) & math.MaxUint8, + 0, 0, 0, 1}), + // Payload length = 16 + payloadGen(16), + }, + ), + }, + }, + wantMalformedIPPackets: 1, + wantMalformedFragments: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ + NewProtocol, + }, + }) + e := channel.New(0, 1500, linkAddr1) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err) + } + + for _, f := range test.fragments { + hdr := buffer.NewPrependable(header.IPv6MinimumSize) + + // Serialize IPv6 fixed header. 
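+ // PayloadLength counts everything after the fixed 40-byte header, so
+ // here it covers the fragment extension header plus the fragment data.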
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(f.data.Size()), + NextHeader: f.nextHdr, + HopLimit: 255, + SrcAddr: f.srcAddr, + DstAddr: f.dstAddr, + }) + + vv := hdr.View().ToVectorisedView() + vv.Append(f.data) + + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: vv, + })) + } + + if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), test.wantMalformedIPPackets; got != want { + t.Errorf("got Stats.IP.MalformedPacketsReceived = %d, want = %d", got, want) + } + if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), test.wantMalformedFragments; got != want { + t.Errorf("got Stats.IP.MalformedFragmentsReceived = %d, want = %d", got, want) + } + }) + } +} + +func TestWriteStats(t *testing.T) { + const nPackets = 3 + tests := []struct { + name string + setup func(*testing.T, *stack.Stack) + allowPackets int + expectSent int + expectDropped int + expectWritten int + }{ + { + name: "Accept all", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + allowPackets: math.MaxInt32, + expectSent: nPackets, + expectDropped: 0, + expectWritten: nPackets, + }, { + name: "Accept all with error", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + allowPackets: nPackets - 1, + expectSent: nPackets - 1, + expectDropped: 0, + expectWritten: nPackets - 1, + }, { + name: "Drop all", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + allowPackets: math.MaxInt32, + expectSent: 0, + expectDropped: nPackets, + expectWritten: nPackets, + }, { + name: "Drop some", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule that matches only 1 + // of the 3 packets. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + // We'll match and DROP the last packet. + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = &stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}} + // Make sure the next rule is ACCEPT. 
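+ // Packets the limitedMatcher does not match fall through to this
+ // rule, so it needs an explicit ACCEPT target for the first
+ // nPackets-1 packets to be written out.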
+ filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + allowPackets: math.MaxInt32, + expectSent: nPackets - 1, + expectDropped: 1, + expectWritten: nPackets, + }, + } + + writers := []struct { + name string + writePackets func(*stack.Route, stack.PacketBufferList) (int, *tcpip.Error) + }{ + { + name: "WritePacket", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + nWritten := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if err := rt.WritePacket(nil, stack.NetworkHeaderParams{}, pkt); err != nil { + return nWritten, err + } + nWritten++ + } + return nWritten, nil + }, + }, { + name: "WritePackets", + writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) { + return rt.WritePackets(nil, pkts, stack.NetworkHeaderParams{}) + }, + }, + } + + for _, writer := range writers { + t.Run(writer.name, func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep := testutil.NewMockLinkEndpoint(header.IPv6MinimumMTU, tcpip.ErrInvalidEndpointState, test.allowPackets) + rt := buildRoute(t, ep) + var pkts stack.PacketBufferList + for i := 0; i < nPackets; i++ { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()), + Data: buffer.NewView(0).ToVectorisedView(), + }) + pkt.TransportHeader().Push(header.UDPMinimumSize) + pkts.PushBack(pkt) + } + + test.setup(t, rt.Stack()) + + nWritten, _ := writer.writePackets(&rt, pkts) + + if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent { + t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) + } + if got := int(rt.Stats().IP.IPTablesOutputDropped.Value()); got != test.expectDropped { + t.Errorf("dropped %d packets, but expected to drop %d", got, test.expectDropped) + } + if nWritten != test.expectWritten { + t.Errorf("wrote %d packets, but expected WritePackets to return %d", nWritten, test.expectWritten) + } + }) + } + }) + } +} + +func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + }) + if err := s.CreateNIC(1, ep); err != nil { + t.Fatalf("CreateNIC(1, _) failed: %s", err) + } + const ( + src = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + dst = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + ) + if err := s.AddAddress(1, ProtocolNumber, src); err != nil { + t.Fatalf("AddAddress(1, %d, %s) failed: %s", ProtocolNumber, src, err) + } + { + mask := tcpip.AddressMask("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff") + subnet, err := tcpip.NewSubnet(dst, mask) + if err != nil { + t.Fatalf("NewSubnet(%s, %s) failed: %v", dst, mask, err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: subnet, + NIC: 1, + }}) + } + rt, err := s.FindRoute(1, src, dst, ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("FindRoute(1, %s, %s, %d, false) = %s, want = nil", src, dst, ProtocolNumber, err) + } + return rt +} + +// limitedMatcher is an iptables matcher that matches after a certain number of +// packets are checked against it. +type limitedMatcher struct { + limit int +} + +// Name implements Matcher.Name. 
+func (*limitedMatcher) Name() string { + return "limitedMatcher" +} + +// Match implements Matcher.Match. +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { + if lm.limit == 0 { + return true, false + } + lm.limit-- + return false, false +} + +func TestClearEndpointFromProtocolOnClose(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + }) + proto := s.NetworkProtocolInstance(ProtocolNumber).(*protocol) + ep := proto.NewEndpoint(&testInterface{}, nil, nil, nil).(*endpoint) + { + proto.mu.Lock() + _, hasEP := proto.mu.eps[ep] + proto.mu.Unlock() + if !hasEP { + t.Fatalf("expected protocol to have ep = %p in set of endpoints", ep) + } + } + + ep.Close() + + { + proto.mu.Lock() + _, hasEP := proto.mu.eps[ep] + proto.mu.Unlock() + if hasEP { + t.Fatalf("unexpectedly found ep = %p in set of protocol's endpoints", ep) + } + } +} + +type fragmentInfo struct { + offset uint16 + more bool + payloadSize uint16 +} + +var fragmentationTests = []struct { + description string + mtu uint32 + gso *stack.GSO + transHdrLen int + payloadSize int + wantFragments []fragmentInfo +}{ + { + description: "No Fragmentation", + mtu: 1280, + gso: nil, + transHdrLen: 0, + payloadSize: 1000, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1000, more: false}, + }, + }, + { + description: "Fragmented", + mtu: 1280, + gso: nil, + transHdrLen: 0, + payloadSize: 2000, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1240, more: true}, + {offset: 154, payloadSize: 776, more: false}, + }, + }, + { + description: "No fragmentation with big header", + mtu: 2000, + gso: nil, + transHdrLen: 100, + payloadSize: 1000, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1100, more: false}, + }, + }, + { + description: "Fragmented with gso none", + mtu: 1280, + gso: &stack.GSO{Type: stack.GSONone}, + transHdrLen: 0, + payloadSize: 1400, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1240, more: true}, + {offset: 154, payloadSize: 176, more: false}, + }, + }, + { + description: "Fragmented with big header", + mtu: 1280, + gso: nil, + transHdrLen: 100, + payloadSize: 1200, + wantFragments: []fragmentInfo{ + {offset: 0, payloadSize: 1240, more: true}, + {offset: 154, payloadSize: 76, more: false}, + }, + }, +} + +func TestFragmentationWritePacket(t *testing.T) { + const ( + ttl = 42 + tos = stack.DefaultTOS + transportProto = tcp.ProtocolNumber + ) + + for _, ft := range fragmentationTests { + t.Run(ft.description, func(t *testing.T) { + pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber) + source := pkt.Clone() + ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32) + r := buildRoute(t, ep) + err := r.WritePacket(ft.gso, stack.NetworkHeaderParams{ + Protocol: tcp.ProtocolNumber, + TTL: ttl, + TOS: stack.DefaultTOS, + }, pkt) + if err != nil { + t.Fatalf("WritePacket(_, _, _) = %s", err) + } + if got := len(ep.WrittenPackets); got != len(ft.wantFragments) { + t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, len(ft.wantFragments)) + } + if got := int(r.Stats().IP.PacketsSent.Value()); got != len(ft.wantFragments) { + t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, len(ft.wantFragments)) + } + if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 { + t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got) + } + if err :=
compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil { + t.Error(err) + } + }) + } +} + +func TestFragmentationWritePackets(t *testing.T) { + const ttl = 42 + tests := []struct { + description string + insertBefore int + insertAfter int + }{ + { + description: "Single packet", + insertBefore: 0, + insertAfter: 0, + }, + { + description: "With packet before", + insertBefore: 1, + insertAfter: 0, + }, + { + description: "With packet after", + insertBefore: 0, + insertAfter: 1, + }, + { + description: "With packet before and after", + insertBefore: 1, + insertAfter: 1, + }, + } + tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, extraHeaderReserve+header.IPv6MinimumSize, []int{1}, header.IPv6ProtocolNumber) + + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + for _, ft := range fragmentationTests { + t.Run(ft.description, func(t *testing.T) { + var pkts stack.PacketBufferList + for i := 0; i < test.insertBefore; i++ { + pkts.PushBack(tinyPacket.Clone()) + } + pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber) + source := pkt + pkts.PushBack(pkt.Clone()) + for i := 0; i < test.insertAfter; i++ { + pkts.PushBack(tinyPacket.Clone()) + } + + ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32) + r := buildRoute(t, ep) + + wantTotalPackets := len(ft.wantFragments) + test.insertBefore + test.insertAfter + n, err := r.WritePackets(ft.gso, pkts, stack.NetworkHeaderParams{ + Protocol: tcp.ProtocolNumber, + TTL: ttl, + TOS: stack.DefaultTOS, + }) + if n != wantTotalPackets || err != nil { + t.Errorf("got WritePackets(_, _, _) = (%d, %s), want = (%d, nil)", n, err, wantTotalPackets) + } + if got := len(ep.WrittenPackets); got != wantTotalPackets { + t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, wantTotalPackets) + } + if got := int(r.Stats().IP.PacketsSent.Value()); got != wantTotalPackets { + t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, wantTotalPackets) + } + if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 { + t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got) + } + + if wantTotalPackets == 0 { + return + } + + fragments := ep.WrittenPackets[test.insertBefore : len(ft.wantFragments)+test.insertBefore] + if err := compareFragments(fragments, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil { + t.Error(err) + } + }) + } + }) + } +} + +// TestFragmentationErrors checks that errors are returned from WritePacket +// correctly.
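+// Each case restricts how many packets the mock link endpoint accepts and
+// checks both the error returned to the caller and the stack's
+// OutgoingPacketErrors counter.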
+func TestFragmentationErrors(t *testing.T) { + const ttl = 42 + + tests := []struct { + description string + mtu uint32 + transHdrLen int + payloadSize int + allowPackets int + outgoingErrors int + mockError *tcpip.Error + wantError *tcpip.Error + }{ + { + description: "No frag", + mtu: 2000, + payloadSize: 1000, + transHdrLen: 0, + allowPackets: 0, + outgoingErrors: 1, + mockError: tcpip.ErrAborted, + wantError: tcpip.ErrAborted, + }, + { + description: "Error on first frag", + mtu: 1300, + payloadSize: 3000, + transHdrLen: 0, + allowPackets: 0, + outgoingErrors: 3, + mockError: tcpip.ErrAborted, + wantError: tcpip.ErrAborted, + }, + { + description: "Error on second frag", + mtu: 1500, + payloadSize: 4000, + transHdrLen: 0, + allowPackets: 1, + outgoingErrors: 2, + mockError: tcpip.ErrAborted, + wantError: tcpip.ErrAborted, + }, + { + description: "Error on packet with MTU smaller than transport header", + mtu: 1280, + transHdrLen: 1500, + payloadSize: 500, + allowPackets: 0, + outgoingErrors: 1, + mockError: nil, + wantError: tcpip.ErrMessageTooLong, + }, + } + + for _, ft := range tests { + t.Run(ft.description, func(t *testing.T) { + pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber) + ep := testutil.NewMockLinkEndpoint(ft.mtu, ft.mockError, ft.allowPackets) + r := buildRoute(t, ep) + err := r.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{ + Protocol: tcp.ProtocolNumber, + TTL: ttl, + TOS: stack.DefaultTOS, + }, pkt) + if err != ft.wantError { + t.Errorf("got WritePacket(_, _, _) = %s, want = %s", err, ft.wantError) + } + if got := int(r.Stats().IP.PacketsSent.Value()); got != ft.allowPackets { + t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, ft.allowPackets) + } + if got := int(r.Stats().IP.OutgoingPacketErrors.Value()); got != ft.outgoingErrors { + t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = %d", got, ft.outgoingErrors) + } + }) + } +} diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/network/ipv6/ndp.go index b0873d1af..40da011f8 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/network/ipv6/ndp.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package stack +package ipv6 import ( "fmt" @@ -23,9 +23,27 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( + // defaultRetransmitTimer is the default amount of time to wait between + // sending reachability probes. + // + // Default taken from RETRANS_TIMER of RFC 4861 section 10. + defaultRetransmitTimer = time.Second + + // minimumRetransmitTimer is the minimum amount of time to wait between + // sending reachability probes. + // + // Note, RFC 4861 does not impose a minimum Retransmit Timer, but we do here + // to make sure the messages are not sent all at once. We also come to this + // value because in the RetransmitTimer field of a Router Advertisement, a + // value of 0 means unspecified, so the smallest valid value is 1. Note, the + // unit of the RetransmitTimer field in the Router Advertisement is + // milliseconds. + minimumRetransmitTimer = time.Millisecond + // defaultDupAddrDetectTransmits is the default number of NDP Neighbor // Solicitation messages to send when doing Duplicate Address Detection // for a tentative address. 
@@ -34,7 +52,7 @@ const ( defaultDupAddrDetectTransmits = 1 // defaultMaxRtrSolicitations is the default number of Router - // Solicitation messages to send when a NIC becomes enabled. + // Solicitation messages to send when an IPv6 endpoint becomes enabled. // // Default = 3 (from RFC 4861 section 10). defaultMaxRtrSolicitations = 3 @@ -131,7 +149,7 @@ const ( minRegenAdvanceDuration = time.Duration(0) // maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt - // SLAAC address regenerations in response to a NIC-local conflict. + // SLAAC address regenerations in response to an IPv6 endpoint-local conflict. maxSLAACAddrLocalRegenAttempts = 10 ) @@ -163,7 +181,7 @@ var ( // This is exported as a variable (instead of a constant) so tests // can update it to a smaller value. // - // This value guarantees that a temporary address will be preferred for at + // This value guarantees that a temporary address is preferred for at // least 1hr if the SLAAC prefix is valid for at least that time. MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour @@ -173,11 +191,17 @@ var ( // This is exported as a variable (instead of a constant) so tests // can update it to a smaller value. // - // This value guarantees that a temporary address will be valid for at least + // This value guarantees that a temporary address is valid for at least // 2hrs if the SLAAC prefix is valid for at least that time. MinMaxTempAddrValidLifetime = 2 * time.Hour ) +// NDPEndpoint is an endpoint that supports NDP. +type NDPEndpoint interface { + // SetNDPConfigurations sets the NDP configurations. + SetNDPConfigurations(NDPConfigurations) +} + // DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an // NDP Router Advertisement informed the Stack about. type DHCPv6ConfigurationFromNDPRA int @@ -192,7 +216,7 @@ const ( // DHCPv6ManagedAddress indicates that addresses are available via DHCPv6. // // DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6 - // will return all available configuration information. + // returns all available configuration information when serving addresses. DHCPv6ManagedAddress // DHCPv6OtherConfigurations indicates that other configuration information is @@ -207,19 +231,18 @@ const ( // NDPDispatcher is the interface integrators of netstack must implement to // receive and handle NDP related events. type NDPDispatcher interface { - // OnDuplicateAddressDetectionStatus will be called when the DAD process - // for an address (addr) on a NIC (with ID nicID) completes. resolved - // will be set to true if DAD completed successfully (no duplicate addr - // detected); false otherwise (addr was detected to be a duplicate on - // the link the NIC is a part of, or it was stopped for some other - // reason, such as the address being removed). If an error occured - // during DAD, err will be set and resolved must be ignored. + // OnDuplicateAddressDetectionStatus is called when the DAD process for an + // address (addr) on a NIC (with ID nicID) completes. resolved is set to true + // if DAD completed successfully (no duplicate addr detected); false otherwise + // (addr was detected to be a duplicate on the link the NIC is a part of, or + // it was stopped for some other reason, such as the address being removed). + // If an error occurred during DAD, err is set and resolved must be ignored. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack.
OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) - // OnDefaultRouterDiscovered will be called when a new default router is + // OnDefaultRouterDiscovered is called when a new default router is // discovered. Implementations must return true if the newly discovered // router should be remembered. // @@ -227,56 +250,55 @@ type NDPDispatcher interface { // is also not permitted to call into the stack. OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool - // OnDefaultRouterInvalidated will be called when a discovered default - // router that was remembered is invalidated. + // OnDefaultRouterInvalidated is called when a discovered default router that + // was remembered is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) - // OnOnLinkPrefixDiscovered will be called when a new on-link prefix is - // discovered. Implementations must return true if the newly discovered - // on-link prefix should be remembered. + // OnOnLinkPrefixDiscovered is called when a new on-link prefix is discovered. + // Implementations must return true if the newly discovered on-link prefix + // should be remembered. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool - // OnOnLinkPrefixInvalidated will be called when a discovered on-link - // prefix that was remembered is invalidated. + // OnOnLinkPrefixInvalidated is called when a discovered on-link prefix that + // was remembered is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) - // OnAutoGenAddress will be called when a new prefix with its - // autonomous address-configuration flag set has been received and SLAAC - // has been performed. Implementations may prevent the stack from - // assigning the address to the NIC by returning false. + // OnAutoGenAddress is called when a new prefix with its autonomous address- + // configuration flag set is received and SLAAC was performed. Implementations + // may prevent the stack from assigning the address to the NIC by returning + // false. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool - // OnAutoGenAddressDeprecated will be called when an auto-generated - // address (as part of SLAAC) has been deprecated, but is still - // considered valid. Note, if an address is invalidated at the same - // time it is deprecated, the deprecation event MAY be omitted. + // OnAutoGenAddressDeprecated is called when an auto-generated address (SLAAC) + // is deprecated, but is still considered valid. Note, if an address is + // invalidated at the same time it is deprecated, the deprecation event may not + // be received. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) - // OnAutoGenAddressInvalidated will be called when an auto-generated - // address (as part of SLAAC) has been invalidated.
+ // OnAutoGenAddressInvalidated is called when an auto-generated address + // (SLAAC) is invalidated. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) - // OnRecursiveDNSServerOption will be called when an NDP option with - // recursive DNS servers has been received. Note, addrs may contain - // link-local addresses. + // OnRecursiveDNSServerOption is called when the stack learns of DNS servers + // through NDP. Note, the addresses may contain link-local addresses. // // It is up to the caller to use the DNS Servers only for their valid // lifetime. OnRecursiveDNSServerOption may be called for new or @@ -288,8 +310,8 @@ type NDPDispatcher interface { // call functions on the stack itself. OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) - // OnDNSSearchListOption will be called when an NDP option with a DNS - // search list has been received. + // OnDNSSearchListOption is called when the stack learns of DNS search lists + // through NDP. // // It is up to the caller to use the domain names in the search list // for only their valid lifetime. OnDNSSearchListOption may be called @@ -298,8 +320,8 @@ type NDPDispatcher interface { // be increased, decreased or completely invalidated when lifetime = 0. OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration) - // OnDHCPv6Configuration will be called with an updated configuration that is - // available via DHCPv6 for a specified NIC. + // OnDHCPv6Configuration is called with an updated configuration that is + // available via DHCPv6 for the passed NIC. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. @@ -320,7 +342,7 @@ type NDPConfigurations struct { // Must be greater than or equal to 1ms. RetransmitTimer time.Duration - // The number of Router Solicitation messages to send when the NIC + // The number of Router Solicitation messages to send when the IPv6 endpoint // becomes enabled. MaxRtrSolicitations uint8 @@ -335,24 +357,22 @@ type NDPConfigurations struct { // Must be greater than or equal to 0s. MaxRtrSolicitationDelay time.Duration - // HandleRAs determines whether or not Router Advertisements will be - // processed. + // HandleRAs determines whether or not Router Advertisements are processed. HandleRAs bool - // DiscoverDefaultRouters determines whether or not default routers will - // be discovered from Router Advertisements. This configuration is - // ignored if HandleRAs is false. + // DiscoverDefaultRouters determines whether or not default routers are + // discovered from Router Advertisements, as per RFC 4861 section 6. This + // configuration is ignored if HandleRAs is false. DiscoverDefaultRouters bool - // DiscoverOnLinkPrefixes determines whether or not on-link prefixes - // will be discovered from Router Advertisements' Prefix Information - // option. This configuration is ignored if HandleRAs is false. + // DiscoverOnLinkPrefixes determines whether or not on-link prefixes are + // discovered from Router Advertisements' Prefix Information option, as per + // RFC 4861 section 6. This configuration is ignored if HandleRAs is false. 
DiscoverOnLinkPrefixes bool - // AutoGenGlobalAddresses determines whether or not global IPv6 - // addresses will be generated for a NIC in response to receiving a new - // Prefix Information option with its Autonomous Address - // AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC). + // AutoGenGlobalAddresses determines whether or not an IPv6 endpoint performs + // SLAAC to auto-generate global SLAAC addresses in response to Prefix + // Information options, as per RFC 4862. // // Note, if an address was already generated for some unique prefix, as // part of SLAAC, this option does not affect whether or not the @@ -366,12 +386,12 @@ type NDPConfigurations struct { // // If the method used to generate the address does not support creating // alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's - // MAC address), then no attempt will be made to resolve the conflict. + // MAC address), then no attempt is made to resolve the conflict. AutoGenAddressConflictRetries uint8 // AutoGenTempGlobalAddresses determines whether or not temporary SLAAC - // addresses will be generated for a NIC as part of SLAAC privacy extensions, - // RFC 4941. + // addresses are generated for an IPv6 endpoint as part of SLAAC privacy + // extensions, as per RFC 4941. // // Ignored if AutoGenGlobalAddresses is false. AutoGenTempGlobalAddresses bool @@ -410,7 +430,7 @@ func DefaultNDPConfigurations() NDPConfigurations { } // validate modifies an NDPConfigurations with valid values. If invalid values -// are present in c, the corresponding default values will be used instead. +// are present in c, the corresponding default values are used instead. func (c *NDPConfigurations) validate() { if c.RetransmitTimer < minimumRetransmitTimer { c.RetransmitTimer = defaultRetransmitTimer @@ -439,8 +459,8 @@ func (c *NDPConfigurations) validate() { // ndpState is the per-interface NDP state. type ndpState struct { - // The NIC this ndpState is for. - nic *NIC + // The IPv6 endpoint this ndpState is for. + ep *endpoint // configs is the per-interface NDP configurations. configs NDPConfigurations @@ -458,8 +478,8 @@ type ndpState struct { // Used to let the Router Solicitation timer know that it has been stopped. // // Must only be read from or written to while protected by the lock of - // the NIC this ndpState is associated with. MUST be set when the timer is - // set. + // the IPv6 endpoint this ndpState is associated with. MUST be set when the + // timer is set. done *bool } @@ -492,7 +512,7 @@ type dadState struct { // Used to let the DAD timer know that it has been stopped. // // Must only be read from or written to while protected by the lock of - // the NIC this dadState is associated with. + // the IPv6 endpoint this dadState is associated with. done *bool } @@ -537,7 +557,7 @@ type tempSLAACAddrState struct { // The address's endpoint. // // Must not be nil. - ref *referencedNetworkEndpoint + addressEndpoint stack.AddressEndpoint // Has a new temporary SLAAC address already been regenerated? regenerated bool @@ -567,10 +587,10 @@ type slaacPrefixState struct { // // May only be nil when the address is being (re-)generated. Otherwise, // must not be nil as all SLAAC prefixes must have a stable address. - ref *referencedNetworkEndpoint + addressEndpoint stack.AddressEndpoint - // The number of times an address has been generated locally where the NIC - // already had the generated address. 
+ // The number of times an address has been generated locally where the IPv6 + // endpoint already had the generated address. localGenerationFailures uint8 } @@ -578,11 +598,12 @@ type slaacPrefixState struct { tempAddrs map[tcpip.Address]tempSLAACAddrState // The next two fields are used by both stable and temporary addresses - // generated for a SLAAC prefix. This is safe as only 1 address will be - // in the generation and DAD process at any time. That is, no two addresses - // will be generated at the same time for a given SLAAC prefix. + // generated for a SLAAC prefix. This is safe as only 1 address is in the + // generation and DAD process at any time. That is, no two addresses are + // generated at the same time for a given SLAAC prefix. - // The number of times an address has been generated and added to the NIC. + // The number of times an address has been generated and added to the IPv6 + // endpoint. // // Addresses may be regenerated in response to DAD conflicts. generationAttempts uint8 @@ -597,16 +618,16 @@ type slaacPrefixState struct { // This function must only be called by IPv6 addresses that are currently // tentative. // -// The NIC that ndp belongs to MUST be locked. -func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error { +// The IPv6 endpoint that ndp belongs to MUST be locked. +func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) *tcpip.Error { // addr must be a valid unicast IPv6 address. if !header.IsV6UnicastAddress(addr) { return tcpip.ErrAddressFamilyNotSupported } - if ref.getKind() != permanentTentative { + if addressEndpoint.GetKind() != stack.PermanentTentative { // The endpoint should be marked as tentative since we are starting DAD. - panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID())) + panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.ep.nic.ID())) } // Should not attempt to perform DAD on an address that is currently in the @@ -617,18 +638,18 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref // existed, we would get an error since we attempted to add a duplicate // address, or its reference count would have been increased without doing // the work that would have been done for an address that was brand new. - // See NIC.addAddressLocked. - panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID())) + // See endpoint.addAddressLocked. + panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.ep.nic.ID())) } remaining := ndp.configs.DupAddrDetectTransmits if remaining == 0 { - ref.setKind(permanent) + addressEndpoint.SetKind(stack.Permanent) // Consider DAD to have resolved even if no DAD messages were actually // transmitted. - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, true, nil) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, true, nil) } return nil @@ -637,25 +658,25 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref var done bool var timer tcpip.Timer // We initially start a timer to fire immediately because some of the DAD work - cannot be done while holding the NIC's lock.
This is effectively the same - as starting a goroutine but we use a timer that fires immediately so we can - reset it for the next DAD iteration. - timer = ndp.nic.stack.Clock().AfterFunc(0, func() { - ndp.nic.mu.Lock() - defer ndp.nic.mu.Unlock() + // cannot be done while holding the IPv6 endpoint's lock. This is effectively + // the same as starting a goroutine but we use a timer that fires immediately + // so we can reset it for the next DAD iteration. + timer = ndp.ep.protocol.stack.Clock().AfterFunc(0, func() { + ndp.ep.mu.Lock() + defer ndp.ep.mu.Unlock() if done { // If we reach this point, it means that the DAD timer fired after - // another goroutine already obtained the NIC lock and stopped DAD - // before this function obtained the NIC lock. Simply return here and do - // nothing further. + // another goroutine already obtained the IPv6 endpoint lock and stopped + // DAD before this function obtained the lock. Simply return here and + // do nothing further. return } - if ref.getKind() != permanentTentative { + if addressEndpoint.GetKind() != stack.PermanentTentative { // The endpoint should still be marked as tentative since we are still // performing DAD on it. - panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID())) + panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID())) } dadDone := remaining == 0 @@ -663,33 +684,34 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref var err *tcpip.Error if !dadDone { // Use the unspecified address as the source address when performing DAD. - ref := ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint) + addressEndpoint := ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint) // Do not hold the lock when sending packets which may be a long running // task or may block link address resolution. We know this is safe // because immediately after obtaining the lock again, we check if DAD - // has been stopped before doing any work with the NIC. Note, DAD would be - // stopped if the NIC was disabled or removed, or if the address was - // removed. - ndp.nic.mu.Unlock() - err = ndp.sendDADPacket(addr, ref) - ndp.nic.mu.Lock() + // has been stopped before doing any work with the IPv6 endpoint. Note, + // DAD would be stopped if the IPv6 endpoint was disabled or closed, or if + // the address was removed. + ndp.ep.mu.Unlock() + err = ndp.sendDADPacket(addr, addressEndpoint) + ndp.ep.mu.Lock() + addressEndpoint.DecRef() } if done { // If we reach this point, it means that DAD was stopped after we released - // the NIC's read lock and before we obtained the write lock. + // the IPv6 endpoint's read lock and before we obtained the write lock. return } if dadDone { // DAD has resolved. - ref.setKind(permanent) + addressEndpoint.SetKind(stack.Permanent) } else if err == nil { // DAD is not done and we had no errors when sending the last NDP NS, // schedule the next DAD timer. remaining-- - timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer) + timer.Reset(ndp.configs.RetransmitTimer) return } @@ -698,16 +720,16 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref // integrator know DAD has completed.
delete(ndp.dad, addr) - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, dadDone, err) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, dadDone, err) } // If DAD resolved for a stable SLAAC address, attempt generation of a // temporary SLAAC address. - if dadDone && ref.configType == slaac { + if dadDone && addressEndpoint.ConfigType() == stack.AddressConfigSlaac { // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. - ndp.regenerateTempSLAACAddr(ref.addrWithPrefix().Subnet(), true /* resetGenAttempts */) + ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */) } }) @@ -722,28 +744,31 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref // sendDADPacket sends a NS message to see if any node on ndp's NIC's link owns // addr. // -// addr must be a tentative IPv6 address on ndp's NIC. +// addr must be a tentative IPv6 address on ndp's IPv6 endpoint. // -// The NIC ndp belongs to MUST NOT be locked. -func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error { +// The IPv6 endpoint that ndp belongs to MUST NOT be locked. +func (ndp *ndpState) sendDADPacket(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) *tcpip.Error { snmc := header.SolicitedNodeAddr(addr) - r := makeRoute(header.IPv6ProtocolNumber, ref.address(), snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false) + r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), header.IPv6Any, snmc, ProtocolNumber, false /* multicastLoop */) + if err != nil { + return err + } defer r.Release() // Route should resolve immediately since snmc is a multicast address so a // remote link address can be calculated without a resolution process. if c, err := r.Resolve(nil); err != nil { // Do not consider the NIC being unknown or disabled as a fatal error. - // Since this method is required to be called when the NIC is not locked, - // the NIC could have been disabled or removed by another goroutine. + // Since this method is required to be called when the IPv6 endpoint is not + // locked, the NIC could have been disabled or removed by another goroutine.
if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState { return err } - panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err)) + panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.ep.nic.ID(), err)) } else if c != nil { - panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID())) + panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.ep.nic.ID())) } icmpData := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize)) @@ -752,17 +777,16 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEnd ns.SetTargetAddress(addr) icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - pkt := NewPacketBuffer(PacketBufferOptions{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Data: buffer.View(icmpData).ToVectorisedView(), }) sent := r.Stats().ICMP.V6PacketsSent if err := r.WritePacket(nil, - NetworkHeaderParams{ + stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, - TOS: DefaultTOS, }, pkt, ); err != nil { sent.Dropped.Increment() @@ -778,11 +802,9 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEnd // such a state forever, unless some other external event resolves the DAD // process (receiving an NA from the true owner of addr, or an NS for addr // (implying another node is attempting to use addr)). It is up to the caller -// of this function to handle such a scenario. Normally, addr will be removed -// from n right after this function returns or the address successfully -// resolved. +// of this function to handle such a scenario. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) { dad, ok := ndp.dad[addr] if !ok { @@ -801,30 +823,30 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) { delete(ndp.dad, addr) // Let the integrator know DAD did not resolve. - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, false, nil) } } // handleRA handles a Router Advertisement message that arrived on the NIC // this ndp is for. Does nothing if the NIC is configured to not handle RAs. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { - // Is the NIC configured to handle RAs at all? + // Is the IPv6 endpoint configured to handle RAs at all? // // Currently, the stack does not determine router interface status on a - // per-interface basis; it is a stack-wide configuration, so we check - // stack's forwarding flag to determine if the NIC is a routing - // interface.
- if !ndp.configs.HandleRAs || ndp.nic.stack.forwarding { + // per-interface basis; it is a protocol-wide configuration, so we check the + // protocol's forwarding flag to determine if the IPv6 endpoint is forwarding + // packets. + if !ndp.configs.HandleRAs || ndp.ep.protocol.Forwarding() { return } // Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we // only inform the dispatcher on configuration changes. We do nothing else // with the information. - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { var configuration DHCPv6ConfigurationFromNDPRA switch { case ra.ManagedAddrConfFlag(): @@ -839,11 +861,11 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { if ndp.dhcpv6Configuration != configuration { ndp.dhcpv6Configuration = configuration - ndpDisp.OnDHCPv6Configuration(ndp.nic.ID(), configuration) + ndpDisp.OnDHCPv6Configuration(ndp.ep.nic.ID(), configuration) } } - // Is the NIC configured to discover default routers? + // Is the IPv6 endpoint configured to discover default routers? if ndp.configs.DiscoverDefaultRouters { rtr, ok := ndp.defaultRouters[ip] rl := ra.RouterLifetime() @@ -881,20 +903,20 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() { switch opt := opt.(type) { case header.NDPRecursiveDNSServer: - if ndp.nic.stack.ndpDisp == nil { + if ndp.ep.protocol.ndpDisp == nil { continue } addrs, _ := opt.Addresses() - ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), addrs, opt.Lifetime()) + ndp.ep.protocol.ndpDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime()) case header.NDPDNSSearchList: - if ndp.nic.stack.ndpDisp == nil { + if ndp.ep.protocol.ndpDisp == nil { continue } domainNames, _ := opt.DomainNames() - ndp.nic.stack.ndpDisp.OnDNSSearchListOption(ndp.nic.ID(), domainNames, opt.Lifetime()) + ndp.ep.protocol.ndpDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime()) case header.NDPPrefixInformation: prefix := opt.Subnet() @@ -928,7 +950,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // invalidateDefaultRouter invalidates a discovered default router. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { rtr, ok := ndp.defaultRouters[ip] @@ -942,32 +964,32 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { delete(ndp.defaultRouters, ip) // Let the integrator know a discovered default router is invalidated. - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnDefaultRouterInvalidated(ndp.ep.nic.ID(), ip) } } // rememberDefaultRouter remembers a newly discovered default router with IPv6 // link-local address ip with lifetime rl. // -// The router identified by ip MUST NOT already be known by the NIC. +// The router identified by ip MUST NOT already be known by the IPv6 endpoint. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { - ndpDisp := ndp.nic.stack.ndpDisp + ndpDisp := ndp.ep.protocol.ndpDisp if ndpDisp == nil { return } // Inform the integrator when we discovered a default router. 
- if !ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) { + if !ndpDisp.OnDefaultRouterDiscovered(ndp.ep.nic.ID(), ip) { // Informed by the integrator to not remember the router, do // nothing further. return } state := defaultRouterState{ - invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { ndp.invalidateDefaultRouter(ip) }), } @@ -982,22 +1004,22 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { // // The prefix identified by prefix MUST NOT already be known. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) { - ndpDisp := ndp.nic.stack.ndpDisp + ndpDisp := ndp.ep.protocol.ndpDisp if ndpDisp == nil { return } // Inform the integrator when we discovered an on-link prefix. - if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) { + if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.ep.nic.ID(), prefix) { // Informed by the integrator to not remember the prefix, do // nothing further. return } state := onLinkPrefixState{ - invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { ndp.invalidateOnLinkPrefix(prefix) }), } @@ -1011,7 +1033,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) // invalidateOnLinkPrefix invalidates a discovered on-link prefix. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { s, ok := ndp.onLinkPrefixes[prefix] @@ -1025,8 +1047,8 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnOnLinkPrefixInvalidated(ndp.ep.nic.ID(), prefix) } } @@ -1036,7 +1058,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { // handleOnLinkPrefixInformation assumes that the prefix this pi is for is // not the link-local prefix and the on-link flag is set. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) { prefix := pi.Subnet() prefixState, ok := ndp.onLinkPrefixes[prefix] @@ -1089,7 +1111,7 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio // handleAutonomousPrefixInformation assumes that the prefix this pi is for is // not the link-local prefix and the autonomous flag is set. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) { vl := pi.ValidLifetime() pl := pi.PreferredLifetime() @@ -1125,7 +1147,7 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform // // pl is the new preferred lifetime. vl is the new valid lifetime. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. 
func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { // If we do not already have an address for this prefix and the valid // lifetime is 0, no need to do anything further, as per RFC 4862 @@ -1142,15 +1164,15 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { } state := slaacPrefixState{ - deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + deprecationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix)) } - ndp.deprecateSLAACAddress(state.stableAddr.ref) + ndp.deprecateSLAACAddress(state.stableAddr.addressEndpoint) }), - invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix)) @@ -1189,7 +1211,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { } // If the address is assigned (DAD resolved), generate a temporary address. - if state.stableAddr.ref.getKind() == permanent { + if state.stableAddr.addressEndpoint.GetKind() == stack.Permanent { // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. ndp.generateTempSLAACAddr(prefix, &state, true /* resetGenAttempts */) @@ -1198,32 +1220,27 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { ndp.slaacPrefixes[prefix] = state } -// addSLAACAddr adds a SLAAC address to the NIC. +// addAndAcquireSLAACAddr adds a SLAAC address to the IPv6 endpoint. // -// The NIC that ndp belongs to MUST be locked. -func (ndp *ndpState) addSLAACAddr(addr tcpip.AddressWithPrefix, configType networkEndpointConfigType, deprecated bool) *referencedNetworkEndpoint { +// The IPv6 endpoint that ndp belongs to MUST be locked. +func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, configType stack.AddressConfigType, deprecated bool) stack.AddressEndpoint { // Inform the integrator that we have a new SLAAC address. - ndpDisp := ndp.nic.stack.ndpDisp + ndpDisp := ndp.ep.protocol.ndpDisp if ndpDisp == nil { return nil } - if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addr) { + if !ndpDisp.OnAutoGenAddress(ndp.ep.nic.ID(), addr) { // Informed by the integrator not to add the address. return nil } - protocolAddr := tcpip.ProtocolAddress{ - Protocol: header.IPv6ProtocolNumber, - AddressWithPrefix: addr, - } - - ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, configType, deprecated) + addressEndpoint, err := ndp.ep.addAndAcquirePermanentAddressLocked(addr, stack.FirstPrimaryEndpoint, configType, deprecated) if err != nil { - panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", protocolAddr, err)) + panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", addr, err)) } - return ref + return addressEndpoint } // generateSLAACAddr generates a SLAAC address for prefix. @@ -1232,10 +1249,10 @@ func (ndp *ndpState) addSLAACAddr(addr tcpip.AddressWithPrefix, configType netwo // // Panics if the prefix is not a SLAAC prefix or it already has an address. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. 
func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool { - if r := state.stableAddr.ref; r != nil { - panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, r.addrWithPrefix())) + if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil { + panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, addressEndpoint.AddressWithPrefix())) } // If we have already reached the maximum address generation attempts for the @@ -1255,11 +1272,11 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt } dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures - if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil { + if oIID := ndp.ep.protocol.opaqueIIDOpts; oIID.NICNameFromID != nil { addrBytes = header.AppendOpaqueInterfaceIdentifier( addrBytes[:header.IIDOffsetInIPv6Address], prefix, - oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name), + oIID.NICNameFromID(ndp.ep.nic.ID(), ndp.ep.nic.Name()), dadCounter, oIID.SecretKey, ) @@ -1272,7 +1289,7 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt // // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by // LinkEndpoint.LinkAddress) before reaching this point. - linkAddr := ndp.nic.linkEP.LinkAddress() + linkAddr := ndp.ep.nic.LinkAddress() if !header.IsValidUnicastEthernetAddress(linkAddr) { return false } @@ -1291,15 +1308,15 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt PrefixLen: validPrefixLenForAutoGen, } - if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) { + if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) { break } state.stableAddr.localGenerationFailures++ } - if ref := ndp.addSLAACAddr(generatedAddr, slaac, time.Since(state.preferredUntil) >= 0 /* deprecated */); ref != nil { - state.stableAddr.ref = ref + if addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaac, time.Since(state.preferredUntil) >= 0 /* deprecated */); addressEndpoint != nil { + state.stableAddr.addressEndpoint = addressEndpoint state.generationAttempts++ return true } @@ -1309,10 +1326,9 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt // regenerateSLAACAddr regenerates an address for a SLAAC prefix. // -// If generating a new address for the prefix fails, the prefix will be -// invalidated. +// If generating a new address for the prefix fails, the prefix is invalidated. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) { state, ok := ndp.slaacPrefixes[prefix] if !ok { @@ -1332,7 +1348,7 @@ func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) { // generateTempSLAACAddr generates a new temporary SLAAC address. // -// If resetGenAttempts is true, the prefix's generation counter will be reset. +// If resetGenAttempts is true, the prefix's generation counter is reset. // // Returns true if a new address was generated. 
func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *slaacPrefixState, resetGenAttempts bool) bool { @@ -1353,7 +1369,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla return false } - stableAddr := prefixState.stableAddr.ref.address() + stableAddr := prefixState.stableAddr.addressEndpoint.AddressWithPrefix().Address now := time.Now() // As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary @@ -1392,7 +1408,8 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla return false } - // Attempt to generate a new address that is not already assigned to the NIC. + // Attempt to generate a new address that is not already assigned to the IPv6 + // endpoint. var generatedAddr tcpip.AddressWithPrefix for i := 0; ; i++ { // If we were unable to generate an address after the maximum SLAAC address @@ -1402,7 +1419,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla } generatedAddr = header.GenerateTempIPv6SLAACAddr(ndp.temporaryIIDHistory[:], stableAddr) - if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) { + if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) { break } } @@ -1410,13 +1427,13 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla // As per RFC RFC 4941 section 3.3 step 5, we MUST NOT create a temporary // address with a zero preferred lifetime. The checks above ensure this // so we know the address is not deprecated. - ref := ndp.addSLAACAddr(generatedAddr, slaacTemp, false /* deprecated */) - if ref == nil { + addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaacTemp, false /* deprecated */) + if addressEndpoint == nil { return false } state := tempSLAACAddrState{ - deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + deprecationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr)) @@ -1427,9 +1444,9 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla panic(fmt.Sprintf("ndp: must have a tempAddr entry to deprecate temporary address %s", generatedAddr)) } - ndp.deprecateSLAACAddress(tempAddrState.ref) + ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint) }), - invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr)) @@ -1442,7 +1459,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState) }), - regenJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { + regenJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr)) @@ -1465,8 +1482,8 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla prefixState.tempAddrs[generatedAddr.Address] = tempAddrState ndp.slaacPrefixes[prefix] = prefixState }), - createdAt: now, - ref: ref, + createdAt: now, + addressEndpoint: addressEndpoint, } 
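Each tempSLAACAddrState above carries three jobs: regeneration fires first so a replacement address exists before the current one is deprecated at the end of its preferred lifetime, and invalidation fires last at the end of the valid lifetime. A rough sketch of that RFC 4941 schedule using plain timers; gvisor's stack.NewJob additionally serializes callbacks under the endpoint's lock, which this toy version only approximates with its own mutex, and all names here are illustrative:

package sketch

import (
	"sync"
	"time"
)

// tempAddrTimers mirrors the regenerate/deprecate/invalidate ordering used
// for each temporary SLAAC address.
type tempAddrTimers struct {
	mu                           sync.Mutex
	regen, deprecate, invalidate *time.Timer
}

// scheduleTempAddr arms the three timers; regenAdvance is how long before
// deprecation a replacement address should be generated.
func scheduleTempAddr(preferred, valid, regenAdvance time.Duration, onRegen, onDeprecate, onInvalidate func()) *tempAddrTimers {
	t := &tempAddrTimers{}
	run := func(f func()) func() {
		return func() {
			t.mu.Lock()
			defer t.mu.Unlock()
			f()
		}
	}
	t.regen = time.AfterFunc(preferred-regenAdvance, run(onRegen))
	t.deprecate = time.AfterFunc(preferred, run(onDeprecate))
	t.invalidate = time.AfterFunc(valid, run(onInvalidate))
	return t
}

// cancel stops all three timers, as happens when the address is invalidated
// early or its prefix is cleaned up.
func (t *tempAddrTimers) cancel() {
	t.regen.Stop()
	t.deprecate.Stop()
	t.invalidate.Stop()
}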
state.deprecationJob.Schedule(pl) @@ -1481,7 +1498,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla // regenerateTempSLAACAddr regenerates a temporary address for a SLAAC prefix. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttempts bool) { state, ok := ndp.slaacPrefixes[prefix] if !ok { @@ -1496,14 +1513,14 @@ func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttemp // // pl is the new preferred lifetime. vl is the new valid lifetime. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixState *slaacPrefixState, pl, vl time.Duration) { // If the preferred lifetime is zero, then the prefix should be deprecated. deprecated := pl == 0 if deprecated { - ndp.deprecateSLAACAddress(prefixState.stableAddr.ref) + ndp.deprecateSLAACAddress(prefixState.stableAddr.addressEndpoint) } else { - prefixState.stableAddr.ref.deprecated = false + prefixState.stableAddr.addressEndpoint.SetDeprecated(false) } // If prefix was preferred for some finite lifetime before, cancel the @@ -1565,7 +1582,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // If DAD is not yet complete on the stable address, there is no need to do // work with temporary addresses. - if prefixState.stableAddr.ref.getKind() != permanent { + if prefixState.stableAddr.addressEndpoint.GetKind() != stack.Permanent { return } @@ -1608,9 +1625,9 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat newPreferredLifetime := preferredUntil.Sub(now) tempAddrState.deprecationJob.Cancel() if newPreferredLifetime <= 0 { - ndp.deprecateSLAACAddress(tempAddrState.ref) + ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint) } else { - tempAddrState.ref.deprecated = false + tempAddrState.addressEndpoint.SetDeprecated(false) tempAddrState.deprecationJob.Schedule(newPreferredLifetime) } @@ -1635,8 +1652,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // due to an update in preferred lifetime. // // If each temporay address has already been regenerated, no new temporary - // address will be generated. To ensure continuation of temporary SLAAC - // addresses, we manually try to regenerate an address here. + // address is generated. To ensure continuation of temporary SLAAC addresses, + // we manually try to regenerate an address here. if len(regenForAddr) != 0 || allAddressesRegenerated { // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. @@ -1647,57 +1664,58 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } } -// deprecateSLAACAddress marks ref as deprecated and notifies the stack's NDP -// dispatcher that ref has been deprecated. +// deprecateSLAACAddress marks the address as deprecated and notifies the NDP +// dispatcher that address has been deprecated. // -// deprecateSLAACAddress does nothing if ref is already deprecated. +// deprecateSLAACAddress does nothing if the address is already deprecated. // -// The NIC that ndp belongs to MUST be locked. -func (ndp *ndpState) deprecateSLAACAddress(ref *referencedNetworkEndpoint) { - if ref.deprecated { +// The IPv6 endpoint that ndp belongs to MUST be locked. 
+func (ndp *ndpState) deprecateSLAACAddress(addressEndpoint stack.AddressEndpoint) { + if addressEndpoint.Deprecated() { return } - ref.deprecated = true - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), ref.addrWithPrefix()) + addressEndpoint.SetDeprecated(true) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix()) } } // invalidateSLAACPrefix invalidates a SLAAC prefix. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) { - if r := state.stableAddr.ref; r != nil { + ndp.cleanupSLAACPrefixResources(prefix, state) + + if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil { // Since we are already invalidating the prefix, do not invalidate the // prefix when removing the address. - if err := ndp.nic.removePermanentIPv6EndpointLocked(r, false /* allowSLAACInvalidation */); err != nil { - panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", r.addrWithPrefix(), err)) + if err := ndp.ep.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */); err != nil { + panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", addressEndpoint.AddressWithPrefix(), err)) } } - - ndp.cleanupSLAACPrefixResources(prefix, state) } // cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC address's // resources. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) { - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr) } prefix := addr.Subnet() state, ok := ndp.slaacPrefixes[prefix] - if !ok || state.stableAddr.ref == nil || addr.Address != state.stableAddr.ref.address() { + if !ok || state.stableAddr.addressEndpoint == nil || addr.Address != state.stableAddr.addressEndpoint.AddressWithPrefix().Address { return } if !invalidatePrefix { // If the prefix is not being invalidated, disassociate the address from the // prefix and do nothing further. - state.stableAddr.ref = nil + state.stableAddr.addressEndpoint.DecRef() + state.stableAddr.addressEndpoint = nil ndp.slaacPrefixes[prefix] = state return } @@ -1709,14 +1727,17 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr // // Panics if the SLAAC prefix is not known. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) { // Invalidate all temporary addresses. 
for tempAddr, tempAddrState := range state.tempAddrs { ndp.invalidateTempSLAACAddr(state.tempAddrs, tempAddr, tempAddrState) } - state.stableAddr.ref = nil + if state.stableAddr.addressEndpoint != nil { + state.stableAddr.addressEndpoint.DecRef() + state.stableAddr.addressEndpoint = nil + } state.deprecationJob.Cancel() state.invalidationJob.Cancel() delete(ndp.slaacPrefixes, prefix) @@ -1724,12 +1745,12 @@ func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaa // invalidateTempSLAACAddr invalidates a temporary SLAAC address. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { // Since we are already invalidating the address, do not invalidate the // address when removing the address. - if err := ndp.nic.removePermanentIPv6EndpointLocked(tempAddrState.ref, false /* allowSLAACInvalidation */); err != nil { - panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.ref.addrWithPrefix(), err)) + if err := ndp.ep.removePermanentEndpointLocked(tempAddrState.addressEndpoint, false /* allowSLAACInvalidation */); err != nil { + panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.addressEndpoint.AddressWithPrefix(), err)) } ndp.cleanupTempSLAACAddrResources(tempAddrs, tempAddr, tempAddrState) @@ -1738,10 +1759,10 @@ func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLA // cleanupTempSLAACAddrResourcesAndNotify cleans up an invalidated temporary // SLAAC address's resources from ndp. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidateAddr bool) { - if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil { - ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr) + if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr) } if !invalidateAddr { @@ -1765,35 +1786,29 @@ func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWi // cleanupTempSLAACAddrResourcesAndNotify cleans up a temporary SLAAC address's // jobs and entry. // -// The NIC that ndp belongs to MUST be locked. +// The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { + tempAddrState.addressEndpoint.DecRef() + tempAddrState.addressEndpoint = nil tempAddrState.deprecationJob.Cancel() tempAddrState.invalidationJob.Cancel() tempAddrState.regenJob.Cancel() delete(tempAddrs, tempAddr) } -// cleanupState cleans up ndp's state. -// -// If hostOnly is true, then only host-specific state will be cleaned up. +// removeSLAACAddresses removes all SLAAC addresses. // -// cleanupState MUST be called with hostOnly set to true when ndp's NIC is -// transitioning from a host to a router. This function will invalidate all -// discovered on-link prefixes, discovered routers, and auto-generated -// addresses. -// -// If hostOnly is true, then the link-local auto-generated address will not be -// invalidated as routers are also expected to generate a link-local address. +// If keepLinkLocal is false, the SLAAC generated link-local address is removed. 
//
-// The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupState(hostOnly bool) {
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) removeSLAACAddresses(keepLinkLocal bool) {
	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
-	linkLocalPrefixes := 0
+	var linkLocalPrefixes int
	for prefix, state := range ndp.slaacPrefixes {
		// RFC 4862 section 5 states that routers are also expected to generate a
		// link-local address so we do not invalidate them if we are cleaning up
		// host-only state.
-		if hostOnly && prefix == linkLocalSubnet {
+		if keepLinkLocal && prefix == linkLocalSubnet {
			linkLocalPrefixes++
			continue
		}
@@ -1804,6 +1819,21 @@
	if got := len(ndp.slaacPrefixes); got != linkLocalPrefixes {
		panic(fmt.Sprintf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalPrefixes))
	}
+}
+
+// cleanupState cleans up ndp's state.
+//
+// If hostOnly is true, then only host-specific state is cleaned up.
+//
+// This function invalidates all discovered on-link prefixes, discovered
+// routers, and auto-generated addresses.
+//
+// If hostOnly is true, then the link-local auto-generated address isn't
+// invalidated as routers are also expected to generate a link-local address.
+//
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupState(hostOnly bool) {
+	ndp.removeSLAACAddresses(hostOnly /* keepLinkLocal */)

	for prefix := range ndp.onLinkPrefixes {
		ndp.invalidateOnLinkPrefix(prefix)
@@ -1827,7 +1857,7 @@
// startSolicitingRouters starts soliciting routers, as per RFC 4861 section
// 6.3.7. If routers are already being solicited, this function does nothing.
//
-// The NIC ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) startSolicitingRouters() {
	if ndp.rtrSolicit.timer != nil {
		// We are already soliciting routers.
@@ -1848,27 +1878,37 @@
	var done bool
	ndp.rtrSolicit.done = &done

-	ndp.rtrSolicit.timer = ndp.nic.stack.Clock().AfterFunc(delay, func() {
-		ndp.nic.mu.Lock()
+	ndp.rtrSolicit.timer = ndp.ep.protocol.stack.Clock().AfterFunc(delay, func() {
+		ndp.ep.mu.Lock()
		if done {
			// If we reach this point, it means that the RS timer fired after another
-			// goroutine already obtained the NIC lock and stopped solicitations.
-			// Simply return here and do nothing further.
-			ndp.nic.mu.Unlock()
+			// goroutine already obtained the IPv6 endpoint lock and stopped
+			// solicitations. Simply return here and do nothing further.
+			ndp.ep.mu.Unlock()
			return
		}

		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
		// to the sending interface, or the unspecified address if no address is
		// assigned to the sending interface.
-		ref := ndp.nic.primaryIPv6EndpointRLocked(header.IPv6AllRoutersMulticastAddress)
-		if ref == nil {
-			ref = ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
+		addressEndpoint := ndp.ep.acquireOutgoingPrimaryAddressRLocked(header.IPv6AllRoutersMulticastAddress, false)
+		if addressEndpoint == nil {
+			// In case this ends up creating a new temporary address, we need to hold
+			// onto the endpoint until a route is obtained.
If we decrement the
+			// reference count before obtaining a route, the address's resources would
+			// be released and attempting to obtain a route after would fail. Once a
+			// route is obtained, it is safe to decrement the reference count since
+			// obtaining a route increments the address's reference count.
+			addressEndpoint = ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint)
		}
-		ndp.nic.mu.Unlock()
+		ndp.ep.mu.Unlock()

-		localAddr := ref.address()
-		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		localAddr := addressEndpoint.AddressWithPrefix().Address
+		r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), localAddr, header.IPv6AllRoutersMulticastAddress, ProtocolNumber, false /* multicastLoop */)
+		addressEndpoint.DecRef()
+		if err != nil {
+			return
+		}
		defer r.Release()

		// Route should resolve immediately since
@@ -1876,15 +1916,16 @@
		// remote link address can be calculated without a resolution process.
		if c, err := r.Resolve(nil); err != nil {
			// Do not consider the NIC being unknown or disabled as a fatal error.
-			// Since this method is required to be called when the NIC is not locked,
-			// the NIC could have been disabled or removed by another goroutine.
+			// Since this method is required to be called when the IPv6 endpoint is
+			// not locked, the IPv6 endpoint could have been disabled or removed by
+			// another goroutine.
			if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState {
				return
			}
-			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err))
+			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID(), err))
		} else if c != nil {
-			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()))
+			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID()))
		}

		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
@@ -1907,21 +1948,20 @@
		rs.Options().Serialize(optsSerializer)
		icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))

-		pkt := NewPacketBuffer(PacketBufferOptions{
+		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
			ReserveHeaderBytes: int(r.MaxHeaderLength()),
			Data:               buffer.View(icmpData).ToVectorisedView(),
		})

		sent := r.Stats().ICMP.V6PacketsSent
		if err := r.WritePacket(nil,
-			NetworkHeaderParams{
+			stack.NetworkHeaderParams{
				Protocol: header.ICMPv6ProtocolNumber,
				TTL:      header.NDPHopLimit,
-				TOS:      DefaultTOS,
			}, pkt,
		); err != nil {
			sent.Dropped.Increment()
-			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
+			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.ep.nic.ID(), err)
			// Don't send any more messages if we had an error.
remaining = 0
		} else {
@@ -1929,19 +1969,19 @@
			remaining--
		}

-		ndp.nic.mu.Lock()
+		ndp.ep.mu.Lock()
		if done || remaining == 0 {
			ndp.rtrSolicit.timer = nil
			ndp.rtrSolicit.done = nil
		} else if ndp.rtrSolicit.timer != nil {
			// Note, we need to explicitly check to make sure that
			// the timer field is not nil because if it was nil but
-			// we still reached this point, then we know the NIC
+			// we still reached this point, then we know the IPv6 endpoint
			// was requested to stop soliciting routers so we don't
			// need to send the next Router Solicitation message.
			ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval)
		}
-		ndp.nic.mu.Unlock()
+		ndp.ep.mu.Unlock()
	})
}

@@ -1949,7 +1989,7 @@
// stopSolicitingRouters stops soliciting routers. If routers are not currently
// being solicited, this function does nothing.
//
-// The NIC ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) stopSolicitingRouters() {
	if ndp.rtrSolicit.timer == nil {
		// Nothing to do.
@@ -1965,7 +2005,7 @@
// initializeTempAddrState initializes state related to temporary SLAAC
// addresses.
func (ndp *ndpState) initializeTempAddrState() {
-	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.nic.stack.tempIIDSeed, ndp.nic.ID())
+	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.tempIIDSeed, ndp.ep.nic.ID())

	if MaxDesyncFactor != 0 {
		ndp.temporaryAddressDesyncFactor = time.Duration(rand.Int63n(int64(MaxDesyncFactor)))
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index af71a7d6b..ac20f217e 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -15,9 +15,12 @@ package ipv6
 import (
+	"context"
	"strings"
	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -30,12 +33,13 @@ import (
// setupStackAndEndpoint creates a stack with a single NIC with a link-local
// address llladdr and an IPv6 endpoint to a remote with link-local address
// rlladdr
-func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack.Stack, stack.NetworkEndpoint) {
+func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address, useNeighborCache bool) (*stack.Stack, stack.NetworkEndpoint) {
	t.Helper()

	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+		UseNeighborCache:   useNeighborCache,
	})

	if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
@@ -63,11 +67,94 @@
		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
	}

-	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	ep := netProto.NewEndpoint(&testInterface{}, &stubLinkAddressCache{}, &stubNUDHandler{}, &stubDispatcher{})
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+	t.Cleanup(ep.Close)

	return s, ep
}

+var _ NDPDispatcher = (*testNDPDispatcher)(nil)
+
+// testNDPDispatcher is an NDPDispatcher that only allows default router discovery.
+type testNDPDispatcher struct { + addr tcpip.Address +} + +func (*testNDPDispatcher) OnDuplicateAddressDetectionStatus(tcpip.NICID, tcpip.Address, bool, *tcpip.Error) { +} + +func (t *testNDPDispatcher) OnDefaultRouterDiscovered(_ tcpip.NICID, addr tcpip.Address) bool { + t.addr = addr + return true +} + +func (t *testNDPDispatcher) OnDefaultRouterInvalidated(_ tcpip.NICID, addr tcpip.Address) { + t.addr = addr +} + +func (*testNDPDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool { + return false +} + +func (*testNDPDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) { +} + +func (*testNDPDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool { + return false +} + +func (*testNDPDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) { +} + +func (*testNDPDispatcher) OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) { +} + +func (*testNDPDispatcher) OnRecursiveDNSServerOption(tcpip.NICID, []tcpip.Address, time.Duration) { +} + +func (*testNDPDispatcher) OnDNSSearchListOption(tcpip.NICID, []string, time.Duration) { +} + +func (*testNDPDispatcher) OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA) { +} + +func TestStackNDPEndpointInvalidateDefaultRouter(t *testing.T) { + var ndpDisp testNDPDispatcher + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocolWithOptions(Options{ + NDPDisp: &ndpDisp, + })}, + }) + + if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + + ep, err := s.GetNetworkEndpoint(nicID, ProtocolNumber) + if err != nil { + t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, ProtocolNumber, err) + } + + ipv6EP := ep.(*endpoint) + ipv6EP.mu.Lock() + ipv6EP.mu.ndp.rememberDefaultRouter(lladdr1, time.Hour) + ipv6EP.mu.Unlock() + + if ndpDisp.addr != lladdr1 { + t.Fatalf("got ndpDisp.addr = %s, want = %s", ndpDisp.addr, lladdr1) + } + + ndpDisp.addr = "" + ndpEP := ep.(stack.NDPEndpoint) + ndpEP.InvalidateDefaultRouter(lladdr1) + if ndpDisp.addr != lladdr1 { + t.Fatalf("got ndpDisp.addr = %s, want = %s", ndpDisp.addr, lladdr1) + } +} + // TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving a // valid NDP NS message with the Source Link Layer Address option results in a // new entry in the link address cache for the sender of the message. @@ -97,7 +184,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, }) e := channel.New(0, 1280, linkAddr0) if err := s.CreateNIC(nicID, e); err != nil { @@ -171,6 +258,123 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { } } +// TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache tests +// that receiving a valid NDP NS message with the Source Link Layer Address +// option results in a new entry in the link address cache for the sender of +// the message. 
+func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testing.T) { + const nicID = 1 + + tests := []struct { + name string + optsBuf []byte + expectedLinkAddr tcpip.LinkAddress + }{ + { + name: "Valid", + optsBuf: []byte{1, 1, 2, 3, 4, 5, 6, 7}, + expectedLinkAddr: "\x02\x03\x04\x05\x06\x07", + }, + { + name: "Too Small", + optsBuf: []byte{1, 1, 2, 3, 4, 5, 6}, + }, + { + name: "Invalid Length", + optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + UseNeighborCache: true, + }) + e := channel.New(0, 1280, linkAddr0) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + } + + ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + len(test.optsBuf) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) + pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) + pkt.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns.SetTargetAddress(lladdr0) + opts := ns.Options() + copy(opts, test.optsBuf) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, + }) + + invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + + // Invalid count should initially be 0. + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) + } + + e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{ + Data: hdr.View().ToVectorisedView(), + }) + + neighbors, err := s.Neighbors(nicID) + if err != nil { + t.Fatalf("s.Neighbors(%d): %s", nicID, err) + } + + neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry) + for _, n := range neighbors { + if existing, ok := neighborByAddr[n.Addr]; ok { + if diff := cmp.Diff(existing, n); diff != "" { + t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry (-existing +got):\n%s", nicID, diff) + } + t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry: %s", nicID, existing) + } + neighborByAddr[n.Addr] = n + } + + if neigh, ok := neighborByAddr[lladdr1]; len(test.expectedLinkAddr) != 0 { + // Invalid count should not have increased. + if got := invalid.Value(); got != 0 { + t.Errorf("got invalid = %d, want = 0", got) + } + + if !ok { + t.Fatalf("expected a neighbor entry for %q", lladdr1) + } + if neigh.LinkAddr != test.expectedLinkAddr { + t.Errorf("got link address = %s, want = %s", neigh.LinkAddr, test.expectedLinkAddr) + } + if neigh.State != stack.Stale { + t.Errorf("got NUD state = %s, want = %s", neigh.State, stack.Stale) + } + } else { + // Invalid count should have increased. 
+ if got := invalid.Value(); got != 1 { + t.Errorf("got invalid = %d, want = 1", got) + } + + if ok { + t.Fatalf("unexpectedly got neighbor entry: %s", neigh) + } + } + }) + } +} + func TestNeighorSolicitationResponse(t *testing.T) { const nicID = 1 nicAddr := lladdr0 @@ -180,26 +384,41 @@ func TestNeighorSolicitationResponse(t *testing.T) { remoteLinkAddr0 := linkAddr1 remoteLinkAddr1 := linkAddr2 + stacks := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } + tests := []struct { - name string - nsOpts header.NDPOptionsSerializer - nsSrcLinkAddr tcpip.LinkAddress - nsSrc tcpip.Address - nsDst tcpip.Address - nsInvalid bool - naDstLinkAddr tcpip.LinkAddress - naSolicited bool - naSrc tcpip.Address - naDst tcpip.Address + name string + nsOpts header.NDPOptionsSerializer + nsSrcLinkAddr tcpip.LinkAddress + nsSrc tcpip.Address + nsDst tcpip.Address + nsInvalid bool + naDstLinkAddr tcpip.LinkAddress + naSolicited bool + naSrc tcpip.Address + naDst tcpip.Address + performsLinkResolution bool }{ { - name: "Unspecified source to multicast destination", + name: "Unspecified source to solicited-node multicast destination", nsOpts: nil, nsSrcLinkAddr: remoteLinkAddr0, nsSrc: header.IPv6Any, nsDst: nicAddrSNMC, nsInvalid: false, - naDstLinkAddr: remoteLinkAddr0, + naDstLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllNodesMulticastAddress), naSolicited: false, naSrc: nicAddr, naDst: header.IPv6AllNodesMulticastAddress, @@ -220,11 +439,7 @@ func TestNeighorSolicitationResponse(t *testing.T) { nsSrcLinkAddr: remoteLinkAddr0, nsSrc: header.IPv6Any, nsDst: nicAddr, - nsInvalid: false, - naDstLinkAddr: remoteLinkAddr0, - naSolicited: false, - naSrc: nicAddr, - naDst: header.IPv6AllNodesMulticastAddress, + nsInvalid: true, }, { name: "Unspecified source with source ll option to unicast destination", @@ -236,7 +451,6 @@ func TestNeighorSolicitationResponse(t *testing.T) { nsDst: nicAddr, nsInvalid: true, }, - { name: "Specified source with 1 source ll to multicast destination", nsOpts: header.NDPOptionsSerializer{ @@ -296,6 +510,10 @@ func TestNeighorSolicitationResponse(t *testing.T) { naSolicited: true, naSrc: nicAddr, naDst: remoteAddr, + // Since we send a unicast solicitations to a node without an entry for + // the remote, the node needs to perform neighbor discovery to get the + // remote's link address to send the advertisement response. 
+ performsLinkResolution: true, }, { name: "Specified source with 1 source ll to unicast destination", @@ -338,86 +556,159 @@ func TestNeighorSolicitationResponse(t *testing.T) { }, } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - }) - e := channel.New(1, 1280, nicLinkAddr) - if err := s.CreateNIC(nicID, e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) - } - if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err) - } + for _, stackTyp := range stacks { + t.Run(stackTyp.name, func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + UseNeighborCache: stackTyp.useNeighborCache, + }) + e := channel.New(1, 1280, nicLinkAddr) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err) + } - ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length() - hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) - pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) - pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) - ns.SetTargetAddress(nicAddr) - opts := ns.Options() - opts.Serialize(test.nsOpts) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{})) - payloadLength := hdr.UsedLength() - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: test.nsSrc, - DstAddr: test.nsDst, - }) + ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length() + hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) + pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) + pkt.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns.SetTargetAddress(nicAddr) + opts := ns.Options() + opts.Serialize(test.nsOpts) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: 255, + SrcAddr: test.nsSrc, + DstAddr: test.nsDst, + }) + + invalid := s.Stats().ICMP.V6PacketsReceived.Invalid - invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + // Invalid count should initially be 0. + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) + } - // Invalid count should initially be 0. 
- if got := invalid.Value(); got != 0 { - t.Fatalf("got invalid = %d, want = 0", got) - } + e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) - e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: hdr.View().ToVectorisedView(), - })) + if test.nsInvalid { + if got := invalid.Value(); got != 1 { + t.Fatalf("got invalid = %d, want = 1", got) + } - if test.nsInvalid { - if got := invalid.Value(); got != 1 { - t.Fatalf("got invalid = %d, want = 1", got) - } + if p, got := e.Read(); got { + t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt) + } - if p, got := e.Read(); got { - t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt) - } + // If we expected the NS to be invalid, we have nothing else to check. + return + } - // If we expected the NS to be invalid, we have nothing else to check. - return - } + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) + } - if got := invalid.Value(); got != 0 { - t.Fatalf("got invalid = %d, want = 0", got) - } + if test.performsLinkResolution { + p, got := e.ReadContext(context.Background()) + if !got { + t.Fatal("expected an NDP NS response") + } + + if p.Route.LocalAddress != nicAddr { + t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, nicAddr) + } + if p.Route.LocalLinkAddress != nicLinkAddr { + t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr) + } + respNSDst := header.SolicitedNodeAddr(test.nsSrc) + if p.Route.RemoteAddress != respNSDst { + t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst) + } + if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want) + } + + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(nicAddr), + checker.DstAddr(respNSDst), + checker.TTL(header.NDPHopLimit), + checker.NDPNS( + checker.NDPNSTargetAddress(test.nsSrc), + checker.NDPNSOptions([]header.NDPOption{ + header.NDPSourceLinkLayerAddressOption(nicLinkAddr), + }), + )) + + ser := header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + } + ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + ser.Length() + hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize) + pkt := header.ICMPv6(hdr.Prepend(ndpNASize)) + pkt.SetType(header.ICMPv6NeighborAdvert) + na := header.NDPNeighborAdvert(pkt.NDPPayload()) + na.SetSolicitedFlag(true) + na.SetOverrideFlag(true) + na.SetTargetAddress(test.nsSrc) + na.Options().Serialize(ser) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, nicAddr, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: header.NDPHopLimit, + SrcAddr: test.nsSrc, + DstAddr: nicAddr, + }) + e.InjectLinkAddr(ProtocolNumber, "", stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } - p, got := e.Read() - if !got { - t.Fatal("expected an NDP NA response") - } + p, got := e.ReadContext(context.Background()) + if !got { + t.Fatal("expected an NDP NA response") + } - if p.Route.RemoteLinkAddress != test.naDstLinkAddr { - 
t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr) - } + if p.Route.LocalAddress != test.naSrc { + t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, test.naSrc) + } + if p.Route.LocalLinkAddress != nicLinkAddr { + t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr) + } + if p.Route.RemoteAddress != test.naDst { + t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, test.naDst) + } + if p.Route.RemoteLinkAddress != test.naDstLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr) + } - checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), - checker.SrcAddr(test.naSrc), - checker.DstAddr(test.naDst), - checker.TTL(header.NDPHopLimit), - checker.NDPNA( - checker.NDPNASolicitedFlag(test.naSolicited), - checker.NDPNATargetAddress(nicAddr), - checker.NDPNAOptions([]header.NDPOption{ - header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]), - }), - )) + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(test.naSrc), + checker.DstAddr(test.naDst), + checker.TTL(header.NDPHopLimit), + checker.NDPNA( + checker.NDPNASolicitedFlag(test.naSolicited), + checker.NDPNATargetAddress(nicAddr), + checker.NDPNAOptions([]header.NDPOption{ + header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]), + }), + )) + }) + } }) } } @@ -458,7 +749,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, }) e := channel.New(0, 1280, linkAddr0) if err := s.CreateNIC(nicID, e); err != nil { @@ -532,197 +823,380 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { } } -func TestNDPValidation(t *testing.T) { - setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) { - t.Helper() - - // Create a stack with the assigned link-local address lladdr0 - // and an endpoint to lladdr1. 
- s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1) - - r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) - if err != nil { - t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err) - } - - return s, ep, r - } - - handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) { - nextHdr := uint8(header.ICMPv6ProtocolNumber) - var extensions buffer.View - if atomicFragment { - extensions = buffer.NewView(header.IPv6FragmentExtHdrLength) - extensions[0] = nextHdr - nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier) - } - - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions), - Data: payload.ToVectorisedView(), - }) - ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions))) - ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(payload) + len(extensions)), - NextHeader: nextHdr, - HopLimit: hopLimit, - SrcAddr: r.LocalAddress, - DstAddr: r.RemoteAddress, - }) - if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) { - t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n) - } - ep.HandlePacket(r, pkt) - } - - var tllData [header.NDPLinkLayerAddressSize]byte - header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{ - header.NDPTargetLinkLayerAddressOption(linkAddr1), - }) +// TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache tests +// that receiving a valid NDP NA message with the Target Link Layer Address +// option does not result in a new entry in the neighbor cache for the target +// of the message. +func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *testing.T) { + const nicID = 1 - types := []struct { - name string - typ header.ICMPv6Type - size int - extraData []byte - statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter + tests := []struct { + name string + optsBuf []byte + isValid bool }{ { - name: "RouterSolicit", - typ: header.ICMPv6RouterSolicit, - size: header.ICMPv6MinimumSize, - statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.RouterSolicit - }, - }, - { - name: "RouterAdvert", - typ: header.ICMPv6RouterAdvert, - size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize, - statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.RouterAdvert - }, + name: "Valid", + optsBuf: []byte{2, 1, 2, 3, 4, 5, 6, 7}, + isValid: true, }, { - name: "NeighborSolicit", - typ: header.ICMPv6NeighborSolicit, - size: header.ICMPv6NeighborSolicitMinimumSize, - statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.NeighborSolicit - }, + name: "Too Small", + optsBuf: []byte{2, 1, 2, 3, 4, 5, 6}, }, { - name: "NeighborAdvert", - typ: header.ICMPv6NeighborAdvert, - size: header.ICMPv6NeighborAdvertMinimumSize, - extraData: tllData[:], - statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.NeighborAdvert - }, + name: "Invalid Length", + optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7}, }, { - name: "RedirectMsg", - typ: header.ICMPv6RedirectMsg, - size: header.ICMPv6MinimumSize, - statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { - return stats.RedirectMsg + name: "Multiple", + optsBuf: []byte{ + 2, 1, 2, 3, 4, 5, 6, 7, + 2, 1, 2, 3, 4, 5, 6, 8, }, }, } - subTests := []struct { - name string - atomicFragment 
bool - hopLimit uint8 - code header.ICMPv6Code - valid bool + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + UseNeighborCache: true, + }) + e := channel.New(0, 1280, linkAddr0) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + } + + ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize) + pkt := header.ICMPv6(hdr.Prepend(ndpNASize)) + pkt.SetType(header.ICMPv6NeighborAdvert) + ns := header.NDPNeighborAdvert(pkt.NDPPayload()) + ns.SetTargetAddress(lladdr1) + opts := ns.Options() + copy(opts, test.optsBuf) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, + }) + + invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + + // Invalid count should initially be 0. + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) + } + + e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{ + Data: hdr.View().ToVectorisedView(), + }) + + neighbors, err := s.Neighbors(nicID) + if err != nil { + t.Fatalf("s.Neighbors(%d): %s", nicID, err) + } + + neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry) + for _, n := range neighbors { + if existing, ok := neighborByAddr[n.Addr]; ok { + if diff := cmp.Diff(existing, n); diff != "" { + t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry (-existing +got):\n%s", nicID, diff) + } + t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry: %s", nicID, existing) + } + neighborByAddr[n.Addr] = n + } + + if neigh, ok := neighborByAddr[lladdr1]; ok { + t.Fatalf("unexpectedly got neighbor entry: %s", neigh) + } + + if test.isValid { + // Invalid count should not have increased. + if got := invalid.Value(); got != 0 { + t.Errorf("got invalid = %d, want = 0", got) + } + } else { + // Invalid count should have increased. 
+ if got := invalid.Value(); got != 1 { + t.Errorf("got invalid = %d, want = 1", got) + } + } + }) + } +} + +func TestNDPValidation(t *testing.T) { + stacks := []struct { + name string + useNeighborCache bool }{ { - name: "Valid", - atomicFragment: false, - hopLimit: header.NDPHopLimit, - code: 0, - valid: true, + name: "linkAddrCache", + useNeighborCache: false, }, { - name: "Fragmented", - atomicFragment: true, - hopLimit: header.NDPHopLimit, - code: 0, - valid: false, - }, - { - name: "Invalid hop limit", - atomicFragment: false, - hopLimit: header.NDPHopLimit - 1, - code: 0, - valid: false, - }, - { - name: "Invalid ICMPv6 code", - atomicFragment: false, - hopLimit: header.NDPHopLimit, - code: 1, - valid: false, + name: "neighborCache", + useNeighborCache: true, }, } - for _, typ := range types { - t.Run(typ.name, func(t *testing.T) { - for _, test := range subTests { - t.Run(test.name, func(t *testing.T) { - s, ep, r := setup(t) - defer r.Release() + for _, stackTyp := range stacks { + t.Run(stackTyp.name, func(t *testing.T) { + setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) { + t.Helper() - stats := s.Stats().ICMP.V6PacketsReceived - invalid := stats.Invalid - typStat := typ.statCounter(stats) + // Create a stack with the assigned link-local address lladdr0 + // and an endpoint to lladdr1. + s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1, stackTyp.useNeighborCache) - icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) - copy(icmp[typ.size:], typ.extraData) - icmp.SetType(typ.typ) - icmp.SetCode(test.code) - icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView())) + r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err) + } - // Rx count of the NDP message should initially be 0. - if got := typStat.Value(); got != 0 { - t.Errorf("got %s = %d, want = 0", typ.name, got) - } + return s, ep, r + } - // Invalid count should initially be 0. 
- if got := invalid.Value(); got != 0 { - t.Errorf("got invalid = %d, want = 0", got) - } + handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) { + nextHdr := uint8(header.ICMPv6ProtocolNumber) + var extensions buffer.View + if atomicFragment { + extensions = buffer.NewView(header.IPv6FragmentExtHdrLength) + extensions[0] = nextHdr + nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier) + } - if t.Failed() { - t.FailNow() - } + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions), + Data: payload.ToVectorisedView(), + }) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions))) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(len(payload) + len(extensions)), + NextHeader: nextHdr, + HopLimit: hopLimit, + SrcAddr: r.LocalAddress, + DstAddr: r.RemoteAddress, + }) + if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) { + t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n) + } + ep.HandlePacket(r, pkt) + } - handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r) + var tllData [header.NDPLinkLayerAddressSize]byte + header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{ + header.NDPTargetLinkLayerAddressOption(linkAddr1), + }) - // Rx count of the NDP packet should have increased. - if got := typStat.Value(); got != 1 { - t.Errorf("got %s = %d, want = 1", typ.name, got) - } + var sllData [header.NDPLinkLayerAddressSize]byte + header.NDPOptions(sllData[:]).Serialize(header.NDPOptionsSerializer{ + header.NDPSourceLinkLayerAddressOption(linkAddr1), + }) - want := uint64(0) - if !test.valid { - // Invalid count should have increased. 
- want = 1 - } - if got := invalid.Value(); got != want { - t.Errorf("got invalid = %d, want = %d", got, want) + types := []struct { + name string + typ header.ICMPv6Type + size int + extraData []byte + statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter + routerOnly bool + }{ + { + name: "RouterSolicit", + typ: header.ICMPv6RouterSolicit, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.RouterSolicit + }, + routerOnly: true, + }, + { + name: "RouterAdvert", + typ: header.ICMPv6RouterAdvert, + size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.RouterAdvert + }, + }, + { + name: "NeighborSolicit", + typ: header.ICMPv6NeighborSolicit, + size: header.ICMPv6NeighborSolicitMinimumSize, + extraData: sllData[:], + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.NeighborSolicit + }, + }, + { + name: "NeighborAdvert", + typ: header.ICMPv6NeighborAdvert, + size: header.ICMPv6NeighborAdvertMinimumSize, + extraData: tllData[:], + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.NeighborAdvert + }, + }, + { + name: "RedirectMsg", + typ: header.ICMPv6RedirectMsg, + size: header.ICMPv6MinimumSize, + statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + return stats.RedirectMsg + }, + }, + } + + subTests := []struct { + name string + atomicFragment bool + hopLimit uint8 + code header.ICMPv6Code + valid bool + }{ + { + name: "Valid", + atomicFragment: false, + hopLimit: header.NDPHopLimit, + code: 0, + valid: true, + }, + { + name: "Fragmented", + atomicFragment: true, + hopLimit: header.NDPHopLimit, + code: 0, + valid: false, + }, + { + name: "Invalid hop limit", + atomicFragment: false, + hopLimit: header.NDPHopLimit - 1, + code: 0, + valid: false, + }, + { + name: "Invalid ICMPv6 code", + atomicFragment: false, + hopLimit: header.NDPHopLimit, + code: 1, + valid: false, + }, + } + + for _, typ := range types { + for _, isRouter := range []bool{false, true} { + name := typ.name + if isRouter { + name += " (Router)" } - }) + + t.Run(name, func(t *testing.T) { + for _, test := range subTests { + t.Run(test.name, func(t *testing.T) { + s, ep, r := setup(t) + defer r.Release() + + if isRouter { + // Enabling forwarding makes the stack act as a router. + s.SetForwarding(ProtocolNumber, true) + } + + stats := s.Stats().ICMP.V6PacketsReceived + invalid := stats.Invalid + routerOnly := stats.RouterOnlyPacketsDroppedByHost + typStat := typ.statCounter(stats) + + icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) + copy(icmp[typ.size:], typ.extraData) + icmp.SetType(typ.typ) + icmp.SetCode(test.code) + icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView())) + + // Rx count of the NDP message should initially be 0. + if got := typStat.Value(); got != 0 { + t.Errorf("got %s = %d, want = 0", typ.name, got) + } + + // Invalid count should initially be 0. + if got := invalid.Value(); got != 0 { + t.Errorf("got invalid = %d, want = 0", got) + } + + // RouterOnlyPacketsReceivedByHost count should initially be 0. 
+ if got := routerOnly.Value(); got != 0 { + t.Errorf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got) + } + + if t.Failed() { + t.FailNow() + } + + handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r) + + // Rx count of the NDP packet should have increased. + if got := typStat.Value(); got != 1 { + t.Errorf("got %s = %d, want = 1", typ.name, got) + } + + want := uint64(0) + if !test.valid { + // Invalid count should have increased. + want = 1 + } + if got := invalid.Value(); got != want { + t.Errorf("got invalid = %d, want = %d", got, want) + } + + want = 0 + if test.valid && !isRouter && typ.routerOnly { + // RouterOnlyPacketsReceivedByHost count should have increased. + want = 1 + } + if got := routerOnly.Value(); got != want { + t.Errorf("got RouterOnlyPacketsReceivedByHost = %d, want = %d", got, want) + } + + }) + } + }) + } } }) } + } // TestRouterAdvertValidation tests that when the NIC is configured to handle // NDP Router Advertisement packets, it validates the Router Advertisement // properly before handling them. func TestRouterAdvertValidation(t *testing.T) { + stacks := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } + tests := []struct { name string src tcpip.Address @@ -844,61 +1318,67 @@ func TestRouterAdvertValidation(t *testing.T) { }, } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - e := channel.New(10, 1280, linkAddr1) - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, - }) + for _, stackTyp := range stacks { + t.Run(stackTyp.name, func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e := channel.New(10, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, + UseNeighborCache: stackTyp.useNeighborCache, + }) + + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(_) = %s", err) + } - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(_) = %s", err) - } + icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize) + pkt := header.ICMPv6(hdr.Prepend(icmpSize)) + pkt.SetType(header.ICMPv6RouterAdvert) + pkt.SetCode(test.code) + copy(pkt.NDPPayload(), test.ndpPayload) + payloadLength := hdr.UsedLength() + pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{})) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(icmp.ProtocolNumber6), + HopLimit: test.hopLimit, + SrcAddr: test.src, + DstAddr: header.IPv6AllNodesMulticastAddress, + }) - icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload) - hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize) - pkt := header.ICMPv6(hdr.Prepend(icmpSize)) - pkt.SetType(header.ICMPv6RouterAdvert) - pkt.SetCode(test.code) - copy(pkt.NDPPayload(), test.ndpPayload) - payloadLength := hdr.UsedLength() - pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{})) - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: 
test.hopLimit, - SrcAddr: test.src, - DstAddr: header.IPv6AllNodesMulticastAddress, - }) - - stats := s.Stats().ICMP.V6PacketsReceived - invalid := stats.Invalid - rxRA := stats.RouterAdvert + stats := s.Stats().ICMP.V6PacketsReceived + invalid := stats.Invalid + rxRA := stats.RouterAdvert - if got := invalid.Value(); got != 0 { - t.Fatalf("got invalid = %d, want = 0", got) - } - if got := rxRA.Value(); got != 0 { - t.Fatalf("got rxRA = %d, want = 0", got) - } + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) + } + if got := rxRA.Value(); got != 0 { + t.Fatalf("got rxRA = %d, want = 0", got) + } - e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: hdr.View().ToVectorisedView(), - })) + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) - if got := rxRA.Value(); got != 1 { - t.Fatalf("got rxRA = %d, want = 1", got) - } + if got := rxRA.Value(); got != 1 { + t.Fatalf("got rxRA = %d, want = 1", got) + } - if test.expectedSuccess { - if got := invalid.Value(); got != 0 { - t.Fatalf("got invalid = %d, want = 0", got) - } - } else { - if got := invalid.Value(); got != 1 { - t.Fatalf("got invalid = %d, want = 1", got) - } + if test.expectedSuccess { + if got := invalid.Value(); got != 0 { + t.Fatalf("got invalid = %d, want = 0", got) + } + } else { + if got := invalid.Value(); got != 1 { + t.Fatalf("got invalid = %d, want = 1", got) + } + } + }) } }) } diff --git a/pkg/tcpip/network/testutil/BUILD b/pkg/tcpip/network/testutil/BUILD new file mode 100644 index 000000000..d0ffc299a --- /dev/null +++ b/pkg/tcpip/network/testutil/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "testutil", + srcs = [ + "testutil.go", + ], + visibility = [ + "//pkg/tcpip/network/fragmentation:__pkg__", + "//pkg/tcpip/network/ipv4:__pkg__", + "//pkg/tcpip/network/ipv6:__pkg__", + ], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/network/testutil/testutil.go b/pkg/tcpip/network/testutil/testutil.go new file mode 100644 index 000000000..7cc52985e --- /dev/null +++ b/pkg/tcpip/network/testutil/testutil.go @@ -0,0 +1,144 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil defines types and functions used to test Network Layer +// functionality such as IP fragmentation. +package testutil + +import ( + "fmt" + "math/rand" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// MockLinkEndpoint is an endpoint used for testing, it stores packets written +// to it and can mock errors. +type MockLinkEndpoint struct { + // WrittenPackets is where packets written to the endpoint are stored. 
+    WrittenPackets []*stack.PacketBuffer
+
+    mtu          uint32
+    err          *tcpip.Error
+    allowPackets int
+}
+
+// NewMockLinkEndpoint creates a new MockLinkEndpoint.
+//
+// err is the error that will be returned once allowPackets packets are written
+// to the endpoint.
+func NewMockLinkEndpoint(mtu uint32, err *tcpip.Error, allowPackets int) *MockLinkEndpoint {
+    return &MockLinkEndpoint{
+        mtu:          mtu,
+        err:          err,
+        allowPackets: allowPackets,
+    }
+}
+
+// MTU implements LinkEndpoint.MTU.
+func (ep *MockLinkEndpoint) MTU() uint32 { return ep.mtu }
+
+// Capabilities implements LinkEndpoint.Capabilities.
+func (*MockLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities { return 0 }
+
+// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength.
+func (*MockLinkEndpoint) MaxHeaderLength() uint16 { return 0 }
+
+// LinkAddress implements LinkEndpoint.LinkAddress.
+func (*MockLinkEndpoint) LinkAddress() tcpip.LinkAddress { return "" }
+
+// WritePacket implements LinkEndpoint.WritePacket.
+func (ep *MockLinkEndpoint) WritePacket(_ *stack.Route, _ *stack.GSO, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+    if ep.allowPackets == 0 {
+        return ep.err
+    }
+    ep.allowPackets--
+    ep.WrittenPackets = append(ep.WrittenPackets, pkt)
+    return nil
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (ep *MockLinkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+    var n int
+
+    for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+        if err := ep.WritePacket(r, gso, protocol, pkt); err != nil {
+            return n, err
+        }
+        n++
+    }
+
+    return n, nil
+}
+
+// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
+func (ep *MockLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+    if ep.allowPackets == 0 {
+        return ep.err
+    }
+    ep.allowPackets--
+
+    pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+        Data: vv,
+    })
+    ep.WrittenPackets = append(ep.WrittenPackets, pkt)
+
+    return nil
+}
+
+// Attach implements LinkEndpoint.Attach.
+func (*MockLinkEndpoint) Attach(stack.NetworkDispatcher) {}
+
+// IsAttached implements LinkEndpoint.IsAttached.
+func (*MockLinkEndpoint) IsAttached() bool { return false }
+
+// Wait implements LinkEndpoint.Wait.
+func (*MockLinkEndpoint) Wait() {}
+
+// ARPHardwareType implements LinkEndpoint.ARPHardwareType.
+func (*MockLinkEndpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone }
+
+// AddHeader implements LinkEndpoint.AddHeader.
+func (*MockLinkEndpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) {
+}
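Aside: a sketch (not part of the commit) of how a test might drive MockLinkEndpoint's error injection; the test name and the MTU/size constants below are hypothetical, and the BUILD visibility above would need to allow such a caller.

// Sketch: after allowPackets writes succeed, further writes fail with the
// injected error, and only the successful writes are recorded.
package testutil_test

import (
    "testing"

    "gvisor.dev/gvisor/pkg/tcpip"
    "gvisor.dev/gvisor/pkg/tcpip/header"
    "gvisor.dev/gvisor/pkg/tcpip/network/testutil"
)

func TestMockLinkEndpointErrorInjection(t *testing.T) {
    // Allow exactly one packet through, then fail writes with the given error.
    ep := testutil.NewMockLinkEndpoint(1500 /* mtu */, tcpip.ErrClosedForSend, 1 /* allowPackets */)

    pkt := testutil.MakeRandPkt(header.UDPMinimumSize, header.IPv4MinimumSize, []int{10}, header.IPv4ProtocolNumber)
    if err := ep.WritePacket(nil, nil, header.IPv4ProtocolNumber, pkt); err != nil {
        t.Fatalf("WritePacket(...): %s", err)
    }
    // The second write exhausts allowPackets and returns the injected error.
    if err := ep.WritePacket(nil, nil, header.IPv4ProtocolNumber, pkt); err != tcpip.ErrClosedForSend {
        t.Fatalf("got WritePacket(...) = %v, want = %v", err, tcpip.ErrClosedForSend)
    }
    if got := len(ep.WrittenPackets); got != 1 {
        t.Fatalf("got len(ep.WrittenPackets) = %d, want = 1", got)
    }
}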
+// MakeRandPkt generates a randomized packet. transportHeaderLength indicates
+// how many random bytes will be copied into the Transport Header.
+// extraHeaderReserveLength indicates how much extra space will be reserved for
+// the other headers. The payload is made from Views of the sizes listed in
+// viewSizes.
+func MakeRandPkt(transportHeaderLength int, extraHeaderReserveLength int, viewSizes []int, proto tcpip.NetworkProtocolNumber) *stack.PacketBuffer {
+    var views buffer.VectorisedView
+
+    for _, s := range viewSizes {
+        newView := buffer.NewView(s)
+        if _, err := rand.Read(newView); err != nil {
+            panic(fmt.Sprintf("rand.Read: %s", err))
+        }
+        views.AppendView(newView)
+    }
+
+    pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+        ReserveHeaderBytes: transportHeaderLength + extraHeaderReserveLength,
+        Data:               views,
+    })
+    pkt.NetworkProtocolNumber = proto
+    if _, err := rand.Read(pkt.TransportHeader().Push(transportHeaderLength)); err != nil {
+        panic(fmt.Sprintf("rand.Read: %s", err))
+    }
+    return pkt
+}
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index f6d592eb5..d87193650 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -400,7 +400,11 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) (reservedPort uint16, err *tcpip.Error) {
+//
+// An optional testPort closure may be passed in; if provided, it is used to
+// test whether the picked port can be used. It should return true if the port
+// is safe to use, false otherwise.
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress, testPort func(port uint16) bool) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
@@ -412,12 +416,23 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 		if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice, dst) {
 			return 0, tcpip.ErrPortInUse
 		}
+		if testPort != nil && !testPort(port) {
+			s.releasePortLocked(networks, transport, addr, port, flags.Bits(), bindToDevice, dst)
+			return 0, tcpip.ErrPortInUse
+		}
 		return port, nil
 	}
 
 	// A port wasn't specified, so try to find one.
return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { - return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst), nil + if !s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst) { + return false, nil + } + if testPort != nil && !testPort(p) { + s.releasePortLocked(networks, transport, addr, p, flags.Bits(), bindToDevice, dst) + return false, nil + } + return true, nil }) } diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 58db5868c..4bc949fd8 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -332,7 +332,7 @@ func TestPortReservation(t *testing.T) { pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest) continue } - gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest) + gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest, nil /* testPort */) if err != test.want { t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d, %v) = %v, want %v", test.ip, test.port, test.flags, test.device, test.dest, err, test.want) } diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 0ab089208..51d428049 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -127,8 +127,8 @@ func main() { // Create the stack with ipv4 and tcp protocols, then add a tun-based // NIC and ipv4 address. s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, }) mtu, err := rawfile.GetMTU(tunName) @@ -182,7 +182,7 @@ func main() { if terr == tcpip.ErrConnectStarted { fmt.Println("Connect is pending...") <-notifyCh - terr = ep.GetSockOpt(tcpip.ErrorOption{}) + terr = ep.LastError() } wq.EventUnregister(&waitEntry) diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 9e37cab18..8e0ee1cd7 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -112,8 +112,8 @@ func main() { // Create the stack with ip and tcp protocols, then add a tun-based // NIC and address. 
s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, }) mtu, err := rawfile.GetMTU(tunName) @@ -188,7 +188,7 @@ func main() { defer wq.EventUnregister(&waitEntry) for { - n, wq, err := ep.Accept() + n, wq, err := ep.Accept(nil) if err != nil { if err == tcpip.ErrWouldBlock { <-notifyCh diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 900938dd1..d09ebe7fa 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -54,9 +54,8 @@ go_template_instance( go_library( name = "stack", srcs = [ + "addressable_endpoint_state.go", "conntrack.go", - "dhcpv6configurationfromndpra_string.go", - "forwarder.go", "headertype_string.go", "icmp_rate_limit.go", "iptables.go", @@ -65,7 +64,6 @@ go_library( "iptables_types.go", "linkaddrcache.go", "linkaddrentry_list.go", - "ndp.go", "neighbor_cache.go", "neighbor_entry.go", "neighbor_entry_list.go", @@ -74,6 +72,7 @@ go_library( "nud.go", "packet_buffer.go", "packet_buffer_list.go", + "pending_packets.go", "rand.go", "registration.go", "route.go", @@ -106,6 +105,7 @@ go_test( name = "stack_x_test", size = "medium", srcs = [ + "addressable_endpoint_state_test.go", "ndp_test.go", "nud_test.go", "stack_test.go", @@ -116,6 +116,7 @@ go_test( deps = [ ":stack", "//pkg/rand", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/checker", @@ -138,8 +139,7 @@ go_test( name = "stack_test", size = "small", srcs = [ - "fake_time_test.go", - "forwarder_test.go", + "forwarding_test.go", "linkaddrcache_test.go", "neighbor_cache_test.go", "neighbor_entry_test.go", @@ -152,8 +152,8 @@ go_test( "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/faketime", "//pkg/tcpip/header", - "@com_github_dpjacques_clockwork//:go_default_library", "@com_github_google_go_cmp//cmp:go_default_library", "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go new file mode 100644 index 000000000..261705575 --- /dev/null +++ b/pkg/tcpip/stack/addressable_endpoint_state.go @@ -0,0 +1,755 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +var _ GroupAddressableEndpoint = (*AddressableEndpointState)(nil) +var _ AddressableEndpoint = (*AddressableEndpointState)(nil) + +// AddressableEndpointState is an implementation of an AddressableEndpoint. 
+type AddressableEndpointState struct {
+    networkEndpoint NetworkEndpoint
+
+    // Lock ordering (from outer to inner lock ordering):
+    //
+    // AddressableEndpointState.mu
+    //   addressState.mu
+    mu struct {
+        sync.RWMutex
+
+        endpoints map[tcpip.Address]*addressState
+        primary   []*addressState
+
+        // groups holds the mapping between group addresses and the number of times
+        // they have been joined.
+        groups map[tcpip.Address]uint32
+    }
+}
+
+// Init initializes the AddressableEndpointState with networkEndpoint.
+//
+// Must be called before calling any other function on a.
+func (a *AddressableEndpointState) Init(networkEndpoint NetworkEndpoint) {
+    a.networkEndpoint = networkEndpoint
+
+    a.mu.Lock()
+    defer a.mu.Unlock()
+    a.mu.endpoints = make(map[tcpip.Address]*addressState)
+    a.mu.groups = make(map[tcpip.Address]uint32)
+}
+
+// ReadOnlyAddressableEndpointState provides read-only access to an
+// AddressableEndpointState.
+type ReadOnlyAddressableEndpointState struct {
+    inner *AddressableEndpointState
+}
+
+// AddrOrMatching returns an endpoint for the passed address that is considered
+// bound to the wrapped AddressableEndpointState.
+//
+// If addr is an exact match with an existing address, that address is returned.
+// Otherwise, f is called with each address and the address that f returns true
+// for is returned.
+//
+// Returns nil if no address matches.
+func (m ReadOnlyAddressableEndpointState) AddrOrMatching(addr tcpip.Address, spoofingOrPrimiscuous bool, f func(AddressEndpoint) bool) AddressEndpoint {
+    m.inner.mu.RLock()
+    defer m.inner.mu.RUnlock()
+
+    if ep, ok := m.inner.mu.endpoints[addr]; ok {
+        if ep.IsAssigned(spoofingOrPrimiscuous) && ep.IncRef() {
+            return ep
+        }
+    }
+
+    for _, ep := range m.inner.mu.endpoints {
+        if ep.IsAssigned(spoofingOrPrimiscuous) && f(ep) && ep.IncRef() {
+            return ep
+        }
+    }
+
+    return nil
+}
+
+// Lookup returns the AddressEndpoint for the passed address.
+//
+// Returns nil if the passed address is not associated with the
+// AddressableEndpointState.
+func (m ReadOnlyAddressableEndpointState) Lookup(addr tcpip.Address) AddressEndpoint {
+    m.inner.mu.RLock()
+    defer m.inner.mu.RUnlock()
+
+    ep, ok := m.inner.mu.endpoints[addr]
+    if !ok {
+        return nil
+    }
+    return ep
+}
+
+// ForEach calls f for each address pair.
+//
+// If f returns false, f is no longer called.
+func (m ReadOnlyAddressableEndpointState) ForEach(f func(AddressEndpoint) bool) {
+    m.inner.mu.RLock()
+    defer m.inner.mu.RUnlock()
+
+    for _, ep := range m.inner.mu.endpoints {
+        if !f(ep) {
+            return
+        }
+    }
+}
+
+// ForEachPrimaryEndpoint calls f for each primary address.
+func (m ReadOnlyAddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint)) {
+    m.inner.mu.RLock()
+    defer m.inner.mu.RUnlock()
+    for _, ep := range m.inner.mu.primary {
+        f(ep)
+    }
+}
+
+// ReadOnly returns a readonly reference to a.
+func (a *AddressableEndpointState) ReadOnly() ReadOnlyAddressableEndpointState {
+    return ReadOnlyAddressableEndpointState{inner: a}
+}
+
+func (a *AddressableEndpointState) releaseAddressState(addrState *addressState) {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+    a.releaseAddressStateLocked(addrState)
+}
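Aside: a minimal sketch (not part of the commit) of how a caller outside the stack package might walk addresses through the read-only view defined above; the helper name collectAddresses is hypothetical.

package stack_test

import (
    "gvisor.dev/gvisor/pkg/tcpip"
    "gvisor.dev/gvisor/pkg/tcpip/stack"
)

// collectAddresses gathers every address known to s via its read-only view.
func collectAddresses(s *stack.AddressableEndpointState) []tcpip.AddressWithPrefix {
    var addrs []tcpip.AddressWithPrefix
    s.ReadOnly().ForEach(func(ep stack.AddressEndpoint) bool {
        addrs = append(addrs, ep.AddressWithPrefix())
        return true // Returning false would stop the iteration early.
    })
    return addrs
}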
+// releaseAddressStateLocked removes addrState from a's address state (primary
+// and endpoints list).
+//
+// Preconditions: a.mu must be write locked.
+func (a *AddressableEndpointState) releaseAddressStateLocked(addrState *addressState) {
+    oldPrimary := a.mu.primary
+    for i, s := range a.mu.primary {
+        if s == addrState {
+            a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
+            oldPrimary[len(oldPrimary)-1] = nil
+            break
+        }
+    }
+    delete(a.mu.endpoints, addrState.addr.Address)
+}
+
+// AddAndAcquirePermanentAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, *tcpip.Error) {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+    ep, err := a.addAndAcquireAddressLocked(addr, peb, configType, deprecated, true /* permanent */)
+    // From https://golang.org/doc/faq#nil_error:
+    //
+    // Under the covers, interfaces are implemented as two elements, a type T and
+    // a value V.
+    //
+    // An interface value is nil only if the V and T are both unset (T=nil, V is
+    // not set). In particular, a nil interface will always hold a nil type. If we
+    // store a nil pointer of type *int inside an interface value, the inner type
+    // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+    // an interface value will therefore be non-nil even when the pointer value V
+    // inside is nil.
+    //
+    // Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
+    // we need to explicitly return nil below if ep is (a typed) nil.
+    if ep == nil {
+        return nil, err
+    }
+    return ep, err
+}
+
+// AddAndAcquireTemporaryAddress adds a temporary address.
+//
+// Returns tcpip.ErrDuplicateAddress if the address exists.
+//
+// The temporary address's endpoint is acquired and returned.
+func (a *AddressableEndpointState) AddAndAcquireTemporaryAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior) (AddressEndpoint, *tcpip.Error) {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+    ep, err := a.addAndAcquireAddressLocked(addr, peb, AddressConfigStatic, false /* deprecated */, false /* permanent */)
+    // From https://golang.org/doc/faq#nil_error:
+    //
+    // Under the covers, interfaces are implemented as two elements, a type T and
+    // a value V.
+    //
+    // An interface value is nil only if the V and T are both unset (T=nil, V is
+    // not set). In particular, a nil interface will always hold a nil type. If we
+    // store a nil pointer of type *int inside an interface value, the inner type
+    // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+    // an interface value will therefore be non-nil even when the pointer value V
+    // inside is nil.
+    //
+    // Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
+    // we need to explicitly return nil below if ep is (a typed) nil.
+    if ep == nil {
+        return nil, err
+    }
+    return ep, err
+}
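Aside: the typed-nil pitfall the comments above guard against, reduced to a self-contained example (illustrative only, not from the commit):

package main

import "fmt"

func typedNil() *int { return nil }

func main() {
    var i interface{} = typedNil()
    // i holds (T=*int, V=nil), so it does not compare equal to untyped nil.
    // This is why the methods above explicitly return a bare nil instead of
    // a nil *addressState.
    fmt.Println(i == nil) // false
}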
+// addAndAcquireAddressLocked adds, acquires and returns a permanent or
+// temporary address.
+//
+// If the addressable endpoint already has the address in a non-permanent state,
+// and addAndAcquireAddressLocked is adding a permanent address, that address is
+// promoted in place and its properties set to the properties provided. If the
+// address already exists in any other state, then tcpip.ErrDuplicateAddress is
+// returned, regardless of the kind of address that is being added.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated, permanent bool) (*addressState, *tcpip.Error) {
+    // attemptAddToPrimary is false when the address is already in the primary
+    // address list.
+    attemptAddToPrimary := true
+    addrState, ok := a.mu.endpoints[addr.Address]
+    if ok {
+        if !permanent {
+            // We are adding a non-permanent address but the address exists. No need
+            // to go any further since we can only promote existing temporary/expired
+            // addresses to permanent.
+            return nil, tcpip.ErrDuplicateAddress
+        }
+
+        addrState.mu.Lock()
+        if addrState.mu.kind.IsPermanent() {
+            addrState.mu.Unlock()
+            // We are adding a permanent address but a permanent address already
+            // exists.
+            return nil, tcpip.ErrDuplicateAddress
+        }
+
+        if addrState.mu.refs == 0 {
+            panic(fmt.Sprintf("found an address that should have been released (ref count == 0); address = %s", addrState.addr))
+        }
+
+        // We now promote the address.
+        for i, s := range a.mu.primary {
+            if s == addrState {
+                switch peb {
+                case CanBePrimaryEndpoint:
+                    // The address is already in the primary address list.
+                    attemptAddToPrimary = false
+                case FirstPrimaryEndpoint:
+                    if i == 0 {
+                        // The address is already first in the primary address list.
+                        attemptAddToPrimary = false
+                    } else {
+                        a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
+                    }
+                case NeverPrimaryEndpoint:
+                    a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
+                default:
+                    panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb))
+                }
+                break
+            }
+        }
+    }
+
+    if addrState == nil {
+        addrState = &addressState{
+            addressableEndpointState: a,
+            addr:                     addr,
+        }
+        a.mu.endpoints[addr.Address] = addrState
+        addrState.mu.Lock()
+        // We never promote an address to temporary - it can only be added as such.
+        // If we are actually adding a permanent address, it is promoted below.
+        addrState.mu.kind = Temporary
+    }
+
+    // At this point we have an address we are either promoting from an expired or
+    // temporary address to permanent, promoting an expired address to temporary,
+    // or we are adding a new temporary or permanent address.
+    //
+    // The address MUST be write locked at this point.
+    defer addrState.mu.Unlock()
+
+    if permanent {
+        if addrState.mu.kind.IsPermanent() {
+            panic(fmt.Sprintf("only non-permanent addresses should be promoted to permanent; address = %s", addrState.addr))
+        }
+
+        // Primary addresses are biased by 1.
+        addrState.mu.refs++
+        addrState.mu.kind = Permanent
+    }
+    // Acquire the address before returning it.
+    addrState.mu.refs++
+    addrState.mu.deprecated = deprecated
+    addrState.mu.configType = configType
+
+    if attemptAddToPrimary {
+        switch peb {
+        case NeverPrimaryEndpoint:
+        case CanBePrimaryEndpoint:
+            a.mu.primary = append(a.mu.primary, addrState)
+        case FirstPrimaryEndpoint:
+            if cap(a.mu.primary) == len(a.mu.primary) {
+                a.mu.primary = append([]*addressState{addrState}, a.mu.primary...)
+            } else {
+                // Shift all the endpoints by 1 to make room for the new address at the
+                // front. We could have just created a new slice but this saves
+                // allocations when the slice has capacity for the new address.
+                primaryCount := len(a.mu.primary)
+                a.mu.primary = append(a.mu.primary, nil)
+                if n := copy(a.mu.primary[1:], a.mu.primary); n != primaryCount {
+                    panic(fmt.Sprintf("copied %d elements; expected = %d elements", n, primaryCount))
+                }
+                a.mu.primary[0] = addrState
+            }
+        default:
+            panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb))
+        }
+    }
+
+    return addrState, nil
+}
+
+// RemovePermanentAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+
+    if _, ok := a.mu.groups[addr]; ok {
+        panic(fmt.Sprintf("group address = %s must be removed with LeaveGroup", addr))
+    }
+
+    return a.removePermanentAddressLocked(addr)
+}
+
+// removePermanentAddressLocked is like RemovePermanentAddress but with locking
+// requirements.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
+    addrState, ok := a.mu.endpoints[addr]
+    if !ok {
+        return tcpip.ErrBadLocalAddress
+    }
+
+    return a.removePermanentEndpointLocked(addrState)
+}
+
+// RemovePermanentEndpoint removes the passed endpoint if it is associated with
+// a and permanent.
+func (a *AddressableEndpointState) RemovePermanentEndpoint(ep AddressEndpoint) *tcpip.Error {
+    addrState, ok := ep.(*addressState)
+    if !ok || addrState.addressableEndpointState != a {
+        return tcpip.ErrInvalidEndpointState
+    }
+
+    a.mu.Lock()
+    defer a.mu.Unlock()
+    return a.removePermanentEndpointLocked(addrState)
+}
+
+// removePermanentEndpointLocked is like RemovePermanentEndpoint but with
+// locking requirements.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) removePermanentEndpointLocked(addrState *addressState) *tcpip.Error {
+    if !addrState.GetKind().IsPermanent() {
+        return tcpip.ErrBadLocalAddress
+    }
+
+    addrState.SetKind(PermanentExpired)
+    a.decAddressRefLocked(addrState)
+    return nil
+}
+
+// decAddressRef decrements the address's reference count and releases it once
+// the reference count hits 0.
+func (a *AddressableEndpointState) decAddressRef(addrState *addressState) {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+    a.decAddressRefLocked(addrState)
+}
+
+// decAddressRefLocked is like decAddressRef but with locking requirements.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) decAddressRefLocked(addrState *addressState) {
+    addrState.mu.Lock()
+    defer addrState.mu.Unlock()
+
+    if addrState.mu.refs == 0 {
+        panic(fmt.Sprintf("attempted to decrease ref count for AddressEndpoint w/ addr = %s when it is already released", addrState.addr))
+    }
+
+    addrState.mu.refs--
+
+    if addrState.mu.refs != 0 {
+        return
+    }
+
+    // A non-expired permanent address must not have its reference count dropped
+    // to 0.
+    if addrState.mu.kind.IsPermanent() {
+        panic(fmt.Sprintf("permanent addresses should be removed through the AddressableEndpoint: addr = %s, kind = %d", addrState.addr, addrState.mu.kind))
+    }
+
+    a.releaseAddressStateLocked(addrState)
+}
+// MainAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) MainAddress() tcpip.AddressWithPrefix {
+    a.mu.RLock()
+    defer a.mu.RUnlock()
+
+    ep := a.acquirePrimaryAddressRLocked(func(ep *addressState) bool {
+        return ep.GetKind() == Permanent
+    })
+    if ep == nil {
+        return tcpip.AddressWithPrefix{}
+    }
+
+    addr := ep.AddressWithPrefix()
+    a.decAddressRefLocked(ep)
+    return addr
+}
+
+// acquirePrimaryAddressRLocked returns an acquired primary address that is
+// valid according to isValid.
+//
+// Precondition: a.mu must be read locked.
+func (a *AddressableEndpointState) acquirePrimaryAddressRLocked(isValid func(*addressState) bool) *addressState {
+    var deprecatedEndpoint *addressState
+    for _, ep := range a.mu.primary {
+        if !isValid(ep) {
+            continue
+        }
+
+        if !ep.Deprecated() {
+            if ep.IncRef() {
+                // ep is not deprecated, so return it immediately.
+                //
+                // If we kept track of a deprecated endpoint, decrement its reference
+                // count since it was incremented when we decided to keep track of it.
+                if deprecatedEndpoint != nil {
+                    a.decAddressRefLocked(deprecatedEndpoint)
+                    deprecatedEndpoint = nil
+                }
+
+                return ep
+            }
+        } else if deprecatedEndpoint == nil && ep.IncRef() {
+            // We prefer an endpoint that is not deprecated, but we keep track of
+            // ep in case a doesn't have any non-deprecated endpoints.
+            //
+            // If we end up finding a more preferred endpoint, ep's reference count
+            // will be decremented.
+            deprecatedEndpoint = ep
+        }
+    }
+
+    return deprecatedEndpoint
+}
+
+// AcquireAssignedAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+
+    if addrState, ok := a.mu.endpoints[localAddr]; ok {
+        if !addrState.IsAssigned(allowTemp) {
+            return nil
+        }
+
+        if !addrState.IncRef() {
+            panic(fmt.Sprintf("failed to increase the reference count for address = %s", addrState.addr))
+        }
+
+        return addrState
+    }
+
+    if !allowTemp {
+        return nil
+    }
+
+    addr := localAddr.WithPrefix()
+    ep, err := a.addAndAcquireAddressLocked(addr, tempPEB, AddressConfigStatic, false /* deprecated */, false /* permanent */)
+    if err != nil {
+        // addAndAcquireAddressLocked only returns an error if the address is
+        // already assigned but we just checked above if the address exists so we
+        // expect no error.
+        panic(fmt.Sprintf("a.addAndAcquireAddressLocked(%s, %d, %d, false, false): %s", addr, tempPEB, AddressConfigStatic, err))
+    }
+    // From https://golang.org/doc/faq#nil_error:
+    //
+    // Under the covers, interfaces are implemented as two elements, a type T and
+    // a value V.
+    //
+    // An interface value is nil only if the V and T are both unset (T=nil, V is
+    // not set). In particular, a nil interface will always hold a nil type. If we
+    // store a nil pointer of type *int inside an interface value, the inner type
+    // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+    // an interface value will therefore be non-nil even when the pointer value V
+    // inside is nil.
+    //
+    // Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
+    // we need to explicitly return nil below if ep is (a typed) nil.
+    if ep == nil {
+        return nil
+    }
+    return ep
+}
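Aside: a sketch (not part of the commit) of the acquire/release discipline the methods above expect from callers; the helper name hasAssigned is hypothetical.

// Sketch: every successful acquire bumps the endpoint's reference count, so
// callers must pair it with a DecRef once they are done with the endpoint.
func hasAssigned(s *stack.AddressableEndpointState, addr tcpip.Address) bool {
    ep := s.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint)
    if ep == nil {
        return false
    }
    // Release the reference taken by AcquireAssignedAddress.
    ep.DecRef()
    return true
}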
+// AcquireOutgoingPrimaryAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint {
+    a.mu.RLock()
+    defer a.mu.RUnlock()
+
+    ep := a.acquirePrimaryAddressRLocked(func(ep *addressState) bool {
+        return ep.IsAssigned(allowExpired)
+    })
+
+    // From https://golang.org/doc/faq#nil_error:
+    //
+    // Under the covers, interfaces are implemented as two elements, a type T and
+    // a value V.
+    //
+    // An interface value is nil only if the V and T are both unset (T=nil, V is
+    // not set). In particular, a nil interface will always hold a nil type. If we
+    // store a nil pointer of type *int inside an interface value, the inner type
+    // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+    // an interface value will therefore be non-nil even when the pointer value V
+    // inside is nil.
+    //
+    // Since acquirePrimaryAddressRLocked returns a nil value with a non-nil type,
+    // we need to explicitly return nil below if ep is (a typed) nil.
+    if ep == nil {
+        return nil
+    }
+
+    return ep
+}
+
+// PrimaryAddresses implements AddressableEndpoint.
+func (a *AddressableEndpointState) PrimaryAddresses() []tcpip.AddressWithPrefix {
+    a.mu.RLock()
+    defer a.mu.RUnlock()
+
+    var addrs []tcpip.AddressWithPrefix
+    for _, ep := range a.mu.primary {
+        // Don't include tentative, expired or temporary endpoints
+        // to avoid confusion and prevent the caller from using
+        // those.
+        switch ep.GetKind() {
+        case PermanentTentative, PermanentExpired, Temporary:
+            continue
+        }
+
+        addrs = append(addrs, ep.AddressWithPrefix())
+    }
+
+    return addrs
+}
+
+// PermanentAddresses implements AddressableEndpoint.
+func (a *AddressableEndpointState) PermanentAddresses() []tcpip.AddressWithPrefix {
+    a.mu.RLock()
+    defer a.mu.RUnlock()
+
+    var addrs []tcpip.AddressWithPrefix
+    for _, ep := range a.mu.endpoints {
+        if !ep.GetKind().IsPermanent() {
+            continue
+        }
+
+        addrs = append(addrs, ep.AddressWithPrefix())
+    }
+
+    return addrs
+}
+
+// JoinGroup implements GroupAddressableEndpoint.
+func (a *AddressableEndpointState) JoinGroup(group tcpip.Address) (bool, *tcpip.Error) {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+
+    joins, ok := a.mu.groups[group]
+    if !ok {
+        ep, err := a.addAndAcquireAddressLocked(group.WithPrefix(), NeverPrimaryEndpoint, AddressConfigStatic, false /* deprecated */, true /* permanent */)
+        if err != nil {
+            return false, err
+        }
+        // We have no need for the address endpoint.
+        a.decAddressRefLocked(ep)
+    }
+
+    a.mu.groups[group] = joins + 1
+    return !ok, nil
+}
+
+// LeaveGroup implements GroupAddressableEndpoint.
+func (a *AddressableEndpointState) LeaveGroup(group tcpip.Address) (bool, *tcpip.Error) {
+    a.mu.Lock()
+    defer a.mu.Unlock()
+
+    joins, ok := a.mu.groups[group]
+    if !ok {
+        return false, tcpip.ErrBadLocalAddress
+    }
+
+    if joins == 1 {
+        a.removeGroupAddressLocked(group)
+        delete(a.mu.groups, group)
+        return true, nil
+    }
+
+    a.mu.groups[group] = joins - 1
+    return false, nil
+}
+
+// IsInGroup implements GroupAddressableEndpoint.
+func (a *AddressableEndpointState) IsInGroup(group tcpip.Address) bool {
+    a.mu.RLock()
+    defer a.mu.RUnlock()
+    _, ok := a.mu.groups[group]
+    return ok
+}
+
+func (a *AddressableEndpointState) removeGroupAddressLocked(group tcpip.Address) {
+    if err := a.removePermanentAddressLocked(group); err != nil {
+        // removePermanentAddressLocked would only return an error if group is
+        // not bound to the addressable endpoint, but we know it MUST be assigned
+        // since we have group in our map of groups.
+ panic(fmt.Sprintf("error removing group address = %s: %s", group, err)) + } +} + +// Cleanup forcefully leaves all groups and removes all permanent addresses. +func (a *AddressableEndpointState) Cleanup() { + a.mu.Lock() + defer a.mu.Unlock() + + for group := range a.mu.groups { + a.removeGroupAddressLocked(group) + } + a.mu.groups = make(map[tcpip.Address]uint32) + + for _, ep := range a.mu.endpoints { + // removePermanentEndpointLocked returns tcpip.ErrBadLocalAddress if ep is + // not a permanent address. + if err := a.removePermanentEndpointLocked(ep); err != nil && err != tcpip.ErrBadLocalAddress { + panic(fmt.Sprintf("unexpected error from removePermanentEndpointLocked(%s): %s", ep.addr, err)) + } + } +} + +var _ AddressEndpoint = (*addressState)(nil) + +// addressState holds state for an address. +type addressState struct { + addressableEndpointState *AddressableEndpointState + addr tcpip.AddressWithPrefix + + // Lock ordering (from outer to inner lock ordering): + // + // AddressableEndpointState.mu + // addressState.mu + mu struct { + sync.RWMutex + + refs uint32 + kind AddressKind + configType AddressConfigType + deprecated bool + } +} + +// AddressWithPrefix implements AddressEndpoint. +func (a *addressState) AddressWithPrefix() tcpip.AddressWithPrefix { + return a.addr +} + +// GetKind implements AddressEndpoint. +func (a *addressState) GetKind() AddressKind { + a.mu.RLock() + defer a.mu.RUnlock() + return a.mu.kind +} + +// SetKind implements AddressEndpoint. +func (a *addressState) SetKind(kind AddressKind) { + a.mu.Lock() + defer a.mu.Unlock() + a.mu.kind = kind +} + +// IsAssigned implements AddressEndpoint. +func (a *addressState) IsAssigned(allowExpired bool) bool { + if !a.addressableEndpointState.networkEndpoint.Enabled() { + return false + } + + switch a.GetKind() { + case PermanentTentative: + return false + case PermanentExpired: + return allowExpired + default: + return true + } +} + +// IncRef implements AddressEndpoint. +func (a *addressState) IncRef() bool { + a.mu.Lock() + defer a.mu.Unlock() + if a.mu.refs == 0 { + return false + } + + a.mu.refs++ + return true +} + +// DecRef implements AddressEndpoint. +func (a *addressState) DecRef() { + a.addressableEndpointState.decAddressRef(a) +} + +// ConfigType implements AddressEndpoint. +func (a *addressState) ConfigType() AddressConfigType { + a.mu.RLock() + defer a.mu.RUnlock() + return a.mu.configType +} + +// SetDeprecated implements AddressEndpoint. +func (a *addressState) SetDeprecated(d bool) { + a.mu.Lock() + defer a.mu.Unlock() + a.mu.deprecated = d +} + +// Deprecated implements AddressEndpoint. +func (a *addressState) Deprecated() bool { + a.mu.RLock() + defer a.mu.RUnlock() + return a.mu.deprecated +} diff --git a/pkg/tcpip/stack/addressable_endpoint_state_test.go b/pkg/tcpip/stack/addressable_endpoint_state_test.go new file mode 100644 index 000000000..26787d0a3 --- /dev/null +++ b/pkg/tcpip/stack/addressable_endpoint_state_test.go @@ -0,0 +1,77 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package stack_test + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// TestAddressableEndpointStateCleanup tests that cleaning up an addressable +// endpoint state removes permanent addresses and leaves groups. +func TestAddressableEndpointStateCleanup(t *testing.T) { + var ep fakeNetworkEndpoint + if err := ep.Enable(); err != nil { + t.Fatalf("ep.Enable(): %s", err) + } + + var s stack.AddressableEndpointState + s.Init(&ep) + + addr := tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: 8, + } + + { + ep, err := s.AddAndAcquirePermanentAddress(addr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */) + if err != nil { + t.Fatalf("s.AddAndAcquirePermanentAddress(%s, %d, %d, false): %s", addr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, err) + } + // We don't need the address endpoint. + ep.DecRef() + } + { + ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint) + if ep == nil { + t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = nil, want = non-nil", addr.Address) + } + ep.DecRef() + } + + group := tcpip.Address("\x02") + if added, err := s.JoinGroup(group); err != nil { + t.Fatalf("s.JoinGroup(%s): %s", group, err) + } else if !added { + t.Fatalf("got s.JoinGroup(%s) = false, want = true", group) + } + if !s.IsInGroup(group) { + t.Fatalf("got s.IsInGroup(%s) = false, want = true", group) + } + + s.Cleanup() + { + ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint) + if ep != nil { + ep.DecRef() + t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = %s, want = nil", addr.Address, ep.AddressWithPrefix()) + } + } + if s.IsInGroup(group) { + t.Fatalf("got s.IsInGroup(%s) = true, want = false", group) + } +} diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 7dd344b4f..0cd1da11f 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -196,13 +196,14 @@ type bucket struct { // packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid // TCP header. +// +// Preconditions: pkt.NetworkHeader() is valid. func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) { - // TODO(gvisor.dev/issue/170): Need to support for other - // protocols as well. 
- netHeader := header.IPv4(pkt.NetworkHeader().View()) - if len(netHeader) < header.IPv4MinimumSize || netHeader.TransportProtocol() != header.TCPProtocolNumber { + netHeader := pkt.Network() + if netHeader.TransportProtocol() != header.TCPProtocolNumber { return tupleID{}, tcpip.ErrUnknownProtocol } + tcpHeader := header.TCP(pkt.TransportHeader().View()) if len(tcpHeader) < header.TCPMinimumSize { return tupleID{}, tcpip.ErrUnknownProtocol @@ -214,7 +215,7 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) { dstAddr: netHeader.DestinationAddress(), dstPort: tcpHeader.DestinationPort(), transProto: netHeader.TransportProtocol(), - netProto: header.IPv4ProtocolNumber, + netProto: pkt.NetworkProtocolNumber, }, nil } @@ -268,7 +269,7 @@ func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) { return nil, dirOriginal } -func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt RedirectTarget) *conn { +func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt *RedirectTarget) *conn { tid, err := packetToTupleID(pkt) if err != nil { return nil @@ -281,8 +282,8 @@ func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt Redirec // rule. This tuple will be used to manipulate the packet in // handlePacket. replyTID := tid.reply() - replyTID.srcAddr = rt.MinIP - replyTID.srcPort = rt.MinPort + replyTID.srcAddr = rt.Addr + replyTID.srcPort = rt.Port var manip manipType switch hook { case Prerouting: @@ -344,7 +345,7 @@ func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) { return } - netHeader := header.IPv4(pkt.NetworkHeader().View()) + netHeader := pkt.Network() tcpHeader := header.TCP(pkt.TransportHeader().View()) // For prerouting redirection, packets going in the original direction @@ -366,8 +367,12 @@ func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) { // support cases when they are validated, e.g. when we can't offload // receive checksumming. - netHeader.SetChecksum(0) - netHeader.SetChecksum(^netHeader.CalculateChecksum()) + // After modification, IPv4 packets need a valid checksum. + if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { + netHeader := header.IPv4(pkt.NetworkHeader().View()) + netHeader.SetChecksum(0) + netHeader.SetChecksum(^netHeader.CalculateChecksum()) + } } // handlePacketOutput manipulates ports for packets in Output hook. @@ -377,7 +382,7 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d return } - netHeader := header.IPv4(pkt.NetworkHeader().View()) + netHeader := pkt.Network() tcpHeader := header.TCP(pkt.TransportHeader().View()) // For output redirection, packets going in the original direction @@ -396,7 +401,7 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d // Calculate the TCP checksum and set it. 
tcpHeader.SetChecksum(0) - length := uint16(pkt.Size()) - uint16(netHeader.HeaderLength()) + length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View())) xsum := r.PseudoHeaderChecksum(header.TCPProtocolNumber, length) if gso != nil && gso.NeedsCsum { tcpHeader.SetChecksum(xsum) @@ -405,8 +410,11 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum)) } - netHeader.SetChecksum(0) - netHeader.SetChecksum(^netHeader.CalculateChecksum()) + if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { + netHeader := header.IPv4(pkt.NetworkHeader().View()) + netHeader.SetChecksum(0) + netHeader.SetChecksum(^netHeader.CalculateChecksum()) + } } // handlePacket will manipulate the port and address of the packet if the @@ -422,7 +430,7 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou } // TODO(gvisor.dev/issue/170): Support other transport protocols. - if nh := pkt.NetworkHeader().View(); nh.IsEmpty() || header.IPv4(nh).TransportProtocol() != header.TCPProtocolNumber { + if pkt.Network().TransportProtocol() != header.TCPProtocolNumber { return false } @@ -473,7 +481,7 @@ func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer, hook Hook) { } // We only track TCP connections. - if nh := pkt.NetworkHeader().View(); nh.IsEmpty() || header.IPv4(nh).TransportProtocol() != header.TCPProtocolNumber { + if pkt.Network().TransportProtocol() != header.TCPProtocolNumber { return } @@ -572,7 +580,9 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim // reapTupleLocked tries to remove tuple and its reply from the table. It // returns whether the tuple's connection has timed out. // -// Preconditions: ct.mu is locked for reading and bucket is locked. +// Preconditions: +// * ct.mu is locked for reading. +// * bucket is locked. func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool { if !tuple.conn.timedOut(now) { return false @@ -607,7 +617,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo return true } -func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { +func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber) (tcpip.Address, uint16, *tcpip.Error) { // Lookup the connection. The reply's original destination // describes the original address. tid := tupleID{ @@ -616,7 +626,7 @@ func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint1 dstAddr: epID.RemoteAddress, dstPort: epID.RemotePort, transProto: header.TCPProtocolNumber, - netProto: header.IPv4ProtocolNumber, + netProto: netProto, } conn, _ := ct.connForTID(tid) if conn == nil { diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go deleted file mode 100644 index 5a684eb9d..000000000 --- a/pkg/tcpip/stack/forwarder_test.go +++ /dev/null @@ -1,650 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package stack - -import ( - "encoding/binary" - "math" - "testing" - "time" - - "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" - "gvisor.dev/gvisor/pkg/tcpip/header" -) - -const ( - fwdTestNetNumber tcpip.NetworkProtocolNumber = math.MaxUint32 - fwdTestNetHeaderLen = 12 - fwdTestNetDefaultPrefixLen = 8 - - // fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests, - // except where another value is explicitly used. It is chosen to match - // the MTU of loopback interfaces on linux systems. - fwdTestNetDefaultMTU = 65536 - - dstAddrOffset = 0 - srcAddrOffset = 1 - protocolNumberOffset = 2 -) - -// fwdTestNetworkEndpoint is a network-layer protocol endpoint. -// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only -// use the first three: destination address, source address, and transport -// protocol. They're all one byte fields to simplify parsing. -type fwdTestNetworkEndpoint struct { - nicID tcpip.NICID - proto *fwdTestNetworkProtocol - dispatcher TransportDispatcher - ep LinkEndpoint -} - -func (f *fwdTestNetworkEndpoint) MTU() uint32 { - return f.ep.MTU() - uint32(f.MaxHeaderLength()) -} - -func (f *fwdTestNetworkEndpoint) NICID() tcpip.NICID { - return f.nicID -} - -func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 { - return 123 -} - -func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) { - // Dispatch the packet to the transport protocol. - f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt) -} - -func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 { - return f.ep.MaxHeaderLength() + fwdTestNetHeaderLen -} - -func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 { - return 0 -} - -func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities { - return f.ep.Capabilities() -} - -func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { - return f.proto.Number() -} - -func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error { - // Add the protocol's header to the packet and send it to the link - // endpoint. - b := pkt.NetworkHeader().Push(fwdTestNetHeaderLen) - b[dstAddrOffset] = r.RemoteAddress[0] - b[srcAddrOffset] = r.LocalAddress[0] - b[protocolNumberOffset] = byte(params.Protocol) - - return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt) -} - -// WritePackets implements LinkEndpoint.WritePackets. -func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) { - panic("not implemented") -} - -func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error { - return tcpip.ErrNotSupported -} - -func (*fwdTestNetworkEndpoint) Close() {} - -// fwdTestNetworkProtocol is a network-layer protocol that implements Address -// resolution. 
-type fwdTestNetworkProtocol struct { - addrCache *linkAddrCache - addrResolveDelay time.Duration - onLinkAddressResolved func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) - onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool) -} - -var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil) - -func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber { - return fwdTestNetNumber -} - -func (f *fwdTestNetworkProtocol) MinimumPacketSize() int { - return fwdTestNetHeaderLen -} - -func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int { - return fwdTestNetDefaultPrefixLen -} - -func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1]) -} - -func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) { - netHeader, ok := pkt.NetworkHeader().Consume(fwdTestNetHeaderLen) - if !ok { - return 0, false, false - } - return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true -} - -func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) NetworkEndpoint { - return &fwdTestNetworkEndpoint{ - nicID: nicID, - proto: f, - dispatcher: dispatcher, - ep: ep, - } -} - -func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func (f *fwdTestNetworkProtocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func (f *fwdTestNetworkProtocol) Close() {} - -func (f *fwdTestNetworkProtocol) Wait() {} - -func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { - if f.addrCache != nil && f.onLinkAddressResolved != nil { - time.AfterFunc(f.addrResolveDelay, func() { - f.onLinkAddressResolved(f.addrCache, addr, remoteLinkAddr) - }) - } - return nil -} - -func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { - if f.onResolveStaticAddress != nil { - return f.onResolveStaticAddress(addr) - } - return "", false -} - -func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { - return fwdTestNetNumber -} - -// fwdTestPacketInfo holds all the information about an outbound packet. -type fwdTestPacketInfo struct { - RemoteLinkAddress tcpip.LinkAddress - LocalLinkAddress tcpip.LinkAddress - Pkt *PacketBuffer -} - -type fwdTestLinkEndpoint struct { - dispatcher NetworkDispatcher - mtu uint32 - linkAddr tcpip.LinkAddress - - // C is where outbound packets are queued. - C chan fwdTestPacketInfo -} - -// InjectInbound injects an inbound packet. -func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - e.InjectLinkAddr(protocol, "", pkt) -} - -// InjectLinkAddr injects an inbound packet with a remote link address. -func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) { - e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt) -} - -// Attach saves the stack network-layer dispatcher for use later when packets -// are injected. -func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) { - e.dispatcher = dispatcher -} - -// IsAttached implements stack.LinkEndpoint.IsAttached. 
-func (e *fwdTestLinkEndpoint) IsAttached() bool { - return e.dispatcher != nil -} - -// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized -// during construction. -func (e *fwdTestLinkEndpoint) MTU() uint32 { - return e.mtu -} - -// Capabilities implements stack.LinkEndpoint.Capabilities. -func (e fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities { - caps := LinkEndpointCapabilities(0) - return caps | CapabilityResolutionRequired -} - -// GSOMaxSize returns the maximum GSO packet size. -func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 { - return 1 << 15 -} - -// MaxHeaderLength returns the maximum size of the link layer header. Given it -// doesn't have a header, it just returns 0. -func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 { - return 0 -} - -// LinkAddress returns the link address of this endpoint. -func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress { - return e.linkAddr -} - -func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { - p := fwdTestPacketInfo{ - RemoteLinkAddress: r.RemoteLinkAddress, - LocalLinkAddress: r.LocalLinkAddress, - Pkt: pkt, - } - - select { - case e.C <- p: - default: - } - - return nil -} - -// WritePackets stores outbound packets into the channel. -func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - n := 0 - for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.WritePacket(r, gso, protocol, pkt) - n++ - } - - return n, nil -} - -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - p := fwdTestPacketInfo{ - Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}), - } - - select { - case e.C <- p: - default: - } - - return nil -} - -// Wait implements stack.LinkEndpoint.Wait. -func (*fwdTestLinkEndpoint) Wait() {} - -// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. -func (*fwdTestLinkEndpoint) ARPHardwareType() header.ARPHardwareType { - panic("not implemented") -} - -// AddHeader implements stack.LinkEndpoint.AddHeader. -func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - panic("not implemented") -} - -func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) { - // Create a stack with the network protocol and two NICs. - s := New(Options{ - NetworkProtocols: []NetworkProtocol{proto}, - }) - - proto.addrCache = s.linkAddrCache - - // Enable forwarding. - s.SetForwarding(true) - - // NIC 1 has the link address "a", and added the network address 1. - ep1 = &fwdTestLinkEndpoint{ - C: make(chan fwdTestPacketInfo, 300), - mtu: fwdTestNetDefaultMTU, - linkAddr: "a", - } - if err := s.CreateNIC(1, ep1); err != nil { - t.Fatal("CreateNIC #1 failed:", err) - } - if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil { - t.Fatal("AddAddress #1 failed:", err) - } - - // NIC 2 has the link address "b", and added the network address 2. - ep2 = &fwdTestLinkEndpoint{ - C: make(chan fwdTestPacketInfo, 300), - mtu: fwdTestNetDefaultMTU, - linkAddr: "b", - } - if err := s.CreateNIC(2, ep2); err != nil { - t.Fatal("CreateNIC #2 failed:", err) - } - if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil { - t.Fatal("AddAddress #2 failed:", err) - } - - // Route all packets to NIC 2. 
- { - subnet, err := tcpip.NewSubnet("\x00", "\x00") - if err != nil { - t.Fatal(err) - } - s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}}) - } - - return ep1, ep2 -} - -func TestForwardingWithStaticResolver(t *testing.T) { - // Create a network protocol with a static resolver. - proto := &fwdTestNetworkProtocol{ - onResolveStaticAddress: - // The network address 3 is resolved to the link address "c". - func(addr tcpip.Address) (tcpip.LinkAddress, bool) { - if addr == "\x03" { - return "c", true - } - return "", false - }, - } - - ep1, ep2 := fwdTestNetFactory(t, proto) - - // Inject an inbound packet to address 3 on NIC 1, and see if it is - // forwarded to NIC 2. - buf := buffer.NewView(30) - buf[dstAddrOffset] = 3 - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - var p fwdTestPacketInfo - - select { - case p = <-ep2.C: - default: - t.Fatal("packet not forwarded") - } - - // Test that the static address resolution happened correctly. - if p.RemoteLinkAddress != "c" { - t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) - } - if p.LocalLinkAddress != "b" { - t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) - } -} - -func TestForwardingWithFakeResolver(t *testing.T) { - // Create a network protocol with a fake resolver. - proto := &fwdTestNetworkProtocol{ - addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { - // Any address will be resolved to the link address "c". - cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") - }, - } - - ep1, ep2 := fwdTestNetFactory(t, proto) - - // Inject an inbound packet to address 3 on NIC 1, and see if it is - // forwarded to NIC 2. - buf := buffer.NewView(30) - buf[dstAddrOffset] = 3 - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - var p fwdTestPacketInfo - - select { - case p = <-ep2.C: - case <-time.After(time.Second): - t.Fatal("packet not forwarded") - } - - // Test that the address resolution happened correctly. - if p.RemoteLinkAddress != "c" { - t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) - } - if p.LocalLinkAddress != "b" { - t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) - } -} - -func TestForwardingWithNoResolver(t *testing.T) { - // Create a network protocol without a resolver. - proto := &fwdTestNetworkProtocol{} - - ep1, ep2 := fwdTestNetFactory(t, proto) - - // inject an inbound packet to address 3 on NIC 1, and see if it is - // forwarded to NIC 2. - buf := buffer.NewView(30) - buf[dstAddrOffset] = 3 - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - select { - case <-ep2.C: - t.Fatal("Packet should not be forwarded") - case <-time.After(time.Second): - } -} - -func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) { - // Create a network protocol with a fake resolver. - proto := &fwdTestNetworkProtocol{ - addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { - // Only packets to address 3 will be resolved to the - // link address "c". - if addr == "\x03" { - cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") - } - }, - } - - ep1, ep2 := fwdTestNetFactory(t, proto) - - // Inject an inbound packet to address 4 on NIC 1. This packet should - // not be forwarded. 
- buf := buffer.NewView(30) - buf[dstAddrOffset] = 4 - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - // Inject an inbound packet to address 3 on NIC 1, and see if it is - // forwarded to NIC 2. - buf = buffer.NewView(30) - buf[dstAddrOffset] = 3 - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - var p fwdTestPacketInfo - - select { - case p = <-ep2.C: - case <-time.After(time.Second): - t.Fatal("packet not forwarded") - } - - if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 { - t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset]) - } - - // Test that the address resolution happened correctly. - if p.RemoteLinkAddress != "c" { - t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) - } - if p.LocalLinkAddress != "b" { - t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) - } -} - -func TestForwardingWithFakeResolverTwoPackets(t *testing.T) { - // Create a network protocol with a fake resolver. - proto := &fwdTestNetworkProtocol{ - addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { - // Any packets will be resolved to the link address "c". - cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") - }, - } - - ep1, ep2 := fwdTestNetFactory(t, proto) - - // Inject two inbound packets to address 3 on NIC 1. - for i := 0; i < 2; i++ { - buf := buffer.NewView(30) - buf[dstAddrOffset] = 3 - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - } - - for i := 0; i < 2; i++ { - var p fwdTestPacketInfo - - select { - case p = <-ep2.C: - case <-time.After(time.Second): - t.Fatal("packet not forwarded") - } - - if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 { - t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset]) - } - - // Test that the address resolution happened correctly. - if p.RemoteLinkAddress != "c" { - t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) - } - if p.LocalLinkAddress != "b" { - t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) - } - } -} - -func TestForwardingWithFakeResolverManyPackets(t *testing.T) { - // Create a network protocol with a fake resolver. - proto := &fwdTestNetworkProtocol{ - addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { - // Any packets will be resolved to the link address "c". - cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") - }, - } - - ep1, ep2 := fwdTestNetFactory(t, proto) - - for i := 0; i < maxPendingPacketsPerResolution+5; i++ { - // Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1. - buf := buffer.NewView(30) - buf[dstAddrOffset] = 3 - // Set the packet sequence number. 
- binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i)) - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - } - - for i := 0; i < maxPendingPacketsPerResolution; i++ { - var p fwdTestPacketInfo - - select { - case p = <-ep2.C: - case <-time.After(time.Second): - t.Fatal("packet not forwarded") - } - - b := PayloadSince(p.Pkt.NetworkHeader()) - if b[dstAddrOffset] != 3 { - t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset]) - } - if len(b) < fwdTestNetHeaderLen+2 { - t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", b) - } - seqNumBuf := b[fwdTestNetHeaderLen:] - - // The first 5 packets should not be forwarded so the sequence number should - // start with 5. - want := uint16(i + 5) - if n := binary.BigEndian.Uint16(seqNumBuf); n != want { - t.Fatalf("got the packet #%d, want = #%d", n, want) - } - - // Test that the address resolution happened correctly. - if p.RemoteLinkAddress != "c" { - t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) - } - if p.LocalLinkAddress != "b" { - t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) - } - } -} - -func TestForwardingWithFakeResolverManyResolutions(t *testing.T) { - // Create a network protocol with a fake resolver. - proto := &fwdTestNetworkProtocol{ - addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { - // Any packets will be resolved to the link address "c". - cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") - }, - } - - ep1, ep2 := fwdTestNetFactory(t, proto) - - for i := 0; i < maxPendingResolutions+5; i++ { - // Inject inbound 'maxPendingResolutions + 5' packets on NIC 1. - // Each packet has a different destination address (3 to - // maxPendingResolutions + 7). - buf := buffer.NewView(30) - buf[dstAddrOffset] = byte(3 + i) - ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - } - - for i := 0; i < maxPendingResolutions; i++ { - var p fwdTestPacketInfo - - select { - case p = <-ep2.C: - case <-time.After(time.Second): - t.Fatal("packet not forwarded") - } - - // The first 5 packets (address 3 to 7) should not be forwarded - // because their address resolutions are interrupted. - if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 { - t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset]) - } - - // Test that the address resolution happened correctly. - if p.RemoteLinkAddress != "c" { - t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) - } - if p.LocalLinkAddress != "b" { - t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) - } - } -} diff --git a/pkg/tcpip/stack/forwarding_test.go b/pkg/tcpip/stack/forwarding_test.go new file mode 100644 index 000000000..cf042309e --- /dev/null +++ b/pkg/tcpip/stack/forwarding_test.go @@ -0,0 +1,876 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "encoding/binary" + "math" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +const ( + fwdTestNetNumber tcpip.NetworkProtocolNumber = math.MaxUint32 + fwdTestNetHeaderLen = 12 + fwdTestNetDefaultPrefixLen = 8 + + // fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests, + // except where another value is explicitly used. It is chosen to match + // the MTU of loopback interfaces on linux systems. + fwdTestNetDefaultMTU = 65536 + + dstAddrOffset = 0 + srcAddrOffset = 1 + protocolNumberOffset = 2 +) + +// fwdTestNetworkEndpoint is a network-layer protocol endpoint. +// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only +// use the first three: destination address, source address, and transport +// protocol. They're all one byte fields to simplify parsing. +type fwdTestNetworkEndpoint struct { + AddressableEndpointState + + nic NetworkInterface + proto *fwdTestNetworkProtocol + dispatcher TransportDispatcher +} + +var _ NetworkEndpoint = (*fwdTestNetworkEndpoint)(nil) + +func (*fwdTestNetworkEndpoint) Enable() *tcpip.Error { + return nil +} + +func (*fwdTestNetworkEndpoint) Enabled() bool { + return true +} + +func (*fwdTestNetworkEndpoint) Disable() {} + +func (f *fwdTestNetworkEndpoint) MTU() uint32 { + return f.nic.MTU() - uint32(f.MaxHeaderLength()) +} + +func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 { + return 123 +} + +func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) { + // Dispatch the packet to the transport protocol. + f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt) +} + +func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 { + return f.nic.MaxHeaderLength() + fwdTestNetHeaderLen +} + +func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 { + return 0 +} + +func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { + return f.proto.Number() +} + +func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error { + // Add the protocol's header to the packet and send it to the link + // endpoint. + b := pkt.NetworkHeader().Push(fwdTestNetHeaderLen) + b[dstAddrOffset] = r.RemoteAddress[0] + b[srcAddrOffset] = r.LocalAddress[0] + b[protocolNumberOffset] = byte(params.Protocol) + + return f.nic.WritePacket(r, gso, fwdTestNetNumber, pkt) +} + +// WritePackets implements LinkEndpoint.WritePackets. +func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) { + panic("not implemented") +} + +func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error { + return tcpip.ErrNotSupported +} + +func (f *fwdTestNetworkEndpoint) Close() { + f.AddressableEndpointState.Cleanup() +} + +// fwdTestNetworkProtocol is a network-layer protocol that implements Address +// resolution. 
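
The rewritten test protocol keeps the same trivial wire format as the deleted file: a 12-byte header of which only the first three bytes (destination, source, transport protocol) matter. A self-contained sketch of building and parsing that header, using plain byte slices instead of the stack's PacketBuffer:

package main

import "fmt"

const (
	fwdTestNetHeaderLen  = 12
	dstAddrOffset        = 0
	srcAddrOffset        = 1
	protocolNumberOffset = 2
)

// buildHeader mirrors fwdTestNetworkEndpoint.WritePacket: only the first
// three bytes of the 12-byte header carry information.
func buildHeader(dst, src, proto byte) []byte {
	b := make([]byte, fwdTestNetHeaderLen)
	b[dstAddrOffset] = dst
	b[srcAddrOffset] = src
	b[protocolNumberOffset] = proto
	return b
}

// parseHeader mirrors fwdTestNetworkProtocol.Parse/ParseAddresses, failing
// when the buffer is too short to hold a full header.
func parseHeader(b []byte) (dst, src, proto byte, ok bool) {
	if len(b) < fwdTestNetHeaderLen {
		return 0, 0, 0, false
	}
	return b[dstAddrOffset], b[srcAddrOffset], b[protocolNumberOffset], true
}

func main() {
	h := buildHeader(3, 1, 6)
	dst, src, proto, ok := parseHeader(h)
	fmt.Println(dst, src, proto, ok)
}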
+type fwdTestNetworkProtocol struct { + addrCache *linkAddrCache + neigh *neighborCache + addrResolveDelay time.Duration + onLinkAddressResolved func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) + onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool) + + mu struct { + sync.RWMutex + forwarding bool + } +} + +var _ NetworkProtocol = (*fwdTestNetworkProtocol)(nil) +var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil) + +func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber { + return fwdTestNetNumber +} + +func (f *fwdTestNetworkProtocol) MinimumPacketSize() int { + return fwdTestNetHeaderLen +} + +func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int { + return fwdTestNetDefaultPrefixLen +} + +func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { + return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1]) +} + +func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) { + netHeader, ok := pkt.NetworkHeader().Consume(fwdTestNetHeaderLen) + if !ok { + return 0, false, false + } + return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true +} + +func (f *fwdTestNetworkProtocol) NewEndpoint(nic NetworkInterface, _ LinkAddressCache, _ NUDHandler, dispatcher TransportDispatcher) NetworkEndpoint { + e := &fwdTestNetworkEndpoint{ + nic: nic, + proto: f, + dispatcher: dispatcher, + } + e.AddressableEndpointState.Init(e) + return e +} + +func (*fwdTestNetworkProtocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func (*fwdTestNetworkProtocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func (*fwdTestNetworkProtocol) Close() {} + +func (*fwdTestNetworkProtocol) Wait() {} + +func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { + if f.onLinkAddressResolved != nil { + time.AfterFunc(f.addrResolveDelay, func() { + f.onLinkAddressResolved(f.addrCache, f.neigh, addr, remoteLinkAddr) + }) + } + return nil +} + +func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + if f.onResolveStaticAddress != nil { + return f.onResolveStaticAddress(addr) + } + return "", false +} + +func (*fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return fwdTestNetNumber +} + +// Forwarding implements stack.ForwardingNetworkProtocol. +func (f *fwdTestNetworkProtocol) Forwarding() bool { + f.mu.RLock() + defer f.mu.RUnlock() + return f.mu.forwarding + +} + +// SetForwarding implements stack.ForwardingNetworkProtocol. +func (f *fwdTestNetworkProtocol) SetForwarding(v bool) { + f.mu.Lock() + defer f.mu.Unlock() + f.mu.forwarding = v +} + +// fwdTestPacketInfo holds all the information about an outbound packet. +type fwdTestPacketInfo struct { + RemoteLinkAddress tcpip.LinkAddress + LocalLinkAddress tcpip.LinkAddress + Pkt *PacketBuffer +} + +type fwdTestLinkEndpoint struct { + dispatcher NetworkDispatcher + mtu uint32 + linkAddr tcpip.LinkAddress + + // C is where outbound packets are queued. + C chan fwdTestPacketInfo +} + +// InjectInbound injects an inbound packet. 
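
One structural change in the new fwdTestNetworkProtocol is worth calling out: the forwarding flag lives in an anonymous struct that embeds the sync.RWMutex guarding it, which documents the lock/field relationship in the type itself. A minimal standalone example of the same idiom:

package main

import (
	"fmt"
	"sync"
)

// proto groups the forwarding flag with the mutex that guards it, so a
// reader can see at a glance which lock protects which field.
type proto struct {
	mu struct {
		sync.RWMutex
		forwarding bool
	}
}

func (p *proto) Forwarding() bool {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.mu.forwarding
}

func (p *proto) SetForwarding(v bool) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.mu.forwarding = v
}

func main() {
	var p proto
	p.SetForwarding(true)
	fmt.Println(p.Forwarding())
}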
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + e.InjectLinkAddr(protocol, "", pkt) +} + +// InjectLinkAddr injects an inbound packet with a remote link address. +func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) { + e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt) +} + +// Attach saves the stack network-layer dispatcher for use later when packets +// are injected. +func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) { + e.dispatcher = dispatcher +} + +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *fwdTestLinkEndpoint) IsAttached() bool { + return e.dispatcher != nil +} + +// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized +// during construction. +func (e *fwdTestLinkEndpoint) MTU() uint32 { + return e.mtu +} + +// Capabilities implements stack.LinkEndpoint.Capabilities. +func (e fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities { + caps := LinkEndpointCapabilities(0) + return caps | CapabilityResolutionRequired +} + +// GSOMaxSize returns the maximum GSO packet size. +func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 { + return 1 << 15 +} + +// MaxHeaderLength returns the maximum size of the link layer header. Given it +// doesn't have a header, it just returns 0. +func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 { + return 0 +} + +// LinkAddress returns the link address of this endpoint. +func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress { + return e.linkAddr +} + +func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { + p := fwdTestPacketInfo{ + RemoteLinkAddress: r.RemoteLinkAddress, + LocalLinkAddress: r.LocalLinkAddress, + Pkt: pkt, + } + + select { + case e.C <- p: + default: + } + + return nil +} + +// WritePackets stores outbound packets into the channel. +func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + n := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.WritePacket(r, gso, protocol, pkt) + n++ + } + + return n, nil +} + +// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. +func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { + p := fwdTestPacketInfo{ + Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}), + } + + select { + case e.C <- p: + default: + } + + return nil +} + +// Wait implements stack.LinkEndpoint.Wait. +func (*fwdTestLinkEndpoint) Wait() {} + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*fwdTestLinkEndpoint) ARPHardwareType() header.ARPHardwareType { + panic("not implemented") +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + panic("not implemented") +} + +func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol, useNeighborCache bool) (ep1, ep2 *fwdTestLinkEndpoint) { + // Create a stack with the network protocol and two NICs. + s := New(Options{ + NetworkProtocols: []NetworkProtocolFactory{func(*Stack) NetworkProtocol { return proto }}, + UseNeighborCache: useNeighborCache, + }) + + if !useNeighborCache { + proto.addrCache = s.linkAddrCache + } + + // Enable forwarding. 
+ s.SetForwarding(proto.Number(), true) + + // NIC 1 has the link address "a", and added the network address 1. + ep1 = &fwdTestLinkEndpoint{ + C: make(chan fwdTestPacketInfo, 300), + mtu: fwdTestNetDefaultMTU, + linkAddr: "a", + } + if err := s.CreateNIC(1, ep1); err != nil { + t.Fatal("CreateNIC #1 failed:", err) + } + if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil { + t.Fatal("AddAddress #1 failed:", err) + } + + // NIC 2 has the link address "b", and added the network address 2. + ep2 = &fwdTestLinkEndpoint{ + C: make(chan fwdTestPacketInfo, 300), + mtu: fwdTestNetDefaultMTU, + linkAddr: "b", + } + if err := s.CreateNIC(2, ep2); err != nil { + t.Fatal("CreateNIC #2 failed:", err) + } + if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil { + t.Fatal("AddAddress #2 failed:", err) + } + + if useNeighborCache { + // Control the neighbor cache for NIC 2. + nic, ok := s.nics[2] + if !ok { + t.Fatal("failed to get the neighbor cache for NIC 2") + } + proto.neigh = nic.neigh + } + + // Route all packets to NIC 2. + { + subnet, err := tcpip.NewSubnet("\x00", "\x00") + if err != nil { + t.Fatal(err) + } + s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}}) + } + + return ep1, ep2 +} + +func TestForwardingWithStaticResolver(t *testing.T) { + tests := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Create a network protocol with a static resolver. + proto := &fwdTestNetworkProtocol{ + onResolveStaticAddress: + // The network address 3 is resolved to the link address "c". + func(addr tcpip.Address) (tcpip.LinkAddress, bool) { + if addr == "\x03" { + return "c", true + } + return "", false + }, + } + + ep1, ep2 := fwdTestNetFactory(t, proto, test.useNeighborCache) + + // Inject an inbound packet to address 3 on NIC 1, and see if it is + // forwarded to NIC 2. + buf := buffer.NewView(30) + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + + var p fwdTestPacketInfo + + select { + case p = <-ep2.C: + default: + t.Fatal("packet not forwarded") + } + + // Test that the static address resolution happened correctly. + if p.RemoteLinkAddress != "c" { + t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) + } + if p.LocalLinkAddress != "b" { + t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) + } + }) + } +} + +func TestForwardingWithFakeResolver(t *testing.T) { + tests := []struct { + name string + useNeighborCache bool + proto *fwdTestNetworkProtocol + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) { + // Any address will be resolved to the link address "c". 
+ cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") + }, + }, + }, + { + name: "neighborCache", + useNeighborCache: true, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) { + t.Helper() + if len(remoteLinkAddr) != 0 { + t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr) + } + // Any address will be resolved to the link address "c". + neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache) + + // Inject an inbound packet to address 3 on NIC 1, and see if it is + // forwarded to NIC 2. + buf := buffer.NewView(30) + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + + var p fwdTestPacketInfo + + select { + case p = <-ep2.C: + case <-time.After(time.Second): + t.Fatal("packet not forwarded") + } + + // Test that the address resolution happened correctly. + if p.RemoteLinkAddress != "c" { + t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) + } + if p.LocalLinkAddress != "b" { + t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) + } + }) + } +} + +func TestForwardingWithNoResolver(t *testing.T) { + // Create a network protocol without a resolver. + proto := &fwdTestNetworkProtocol{} + + // Whether or not we use the neighbor cache here does not matter since + // neither linkAddrCache nor neighborCache will be used. + ep1, ep2 := fwdTestNetFactory(t, proto, false /* useNeighborCache */) + + // inject an inbound packet to address 3 on NIC 1, and see if it is + // forwarded to NIC 2. + buf := buffer.NewView(30) + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + + select { + case <-ep2.C: + t.Fatal("Packet should not be forwarded") + case <-time.After(time.Second): + } +} + +func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) { + tests := []struct { + name string + useNeighborCache bool + proto *fwdTestNetworkProtocol + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) { + // Only packets to address 3 will be resolved to the + // link address "c". + if addr == "\x03" { + cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") + } + }, + }, + }, + { + name: "neighborCache", + useNeighborCache: true, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) { + t.Helper() + if len(remoteLinkAddr) != 0 { + t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr) + } + // Only packets to address 3 will be resolved to the + // link address "c". 
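
These tests assert on the outbound channel with two different select shapes: a receive guarded by a time.After deadline when a forwarded packet is expected (resolution is asynchronous), and the inverse when the packet must be dropped, as in TestForwardingWithNoResolver above. Distilled into helpers, as an illustrative sketch over a plain int channel:

package main

import (
	"fmt"
	"time"
)

// expectPacket returns an error unless a value arrives within the timeout;
// used when forwarding should succeed after asynchronous resolution.
func expectPacket(ch <-chan int, timeout time.Duration) (int, error) {
	select {
	case p := <-ch:
		return p, nil
	case <-time.After(timeout):
		return 0, fmt.Errorf("packet not forwarded")
	}
}

// expectNoPacket returns an error if anything arrives before the timeout
// elapses; used when the packet should be dropped.
func expectNoPacket(ch <-chan int, timeout time.Duration) error {
	select {
	case p := <-ch:
		return fmt.Errorf("unexpected packet %d", p)
	case <-time.After(timeout):
		return nil
	}
}

func main() {
	ch := make(chan int, 1)
	ch <- 42
	fmt.Println(expectPacket(ch, time.Second))
	fmt.Println(expectNoPacket(ch, 10*time.Millisecond))
}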
+ if addr == "\x03" { + neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + } + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache) + + // Inject an inbound packet to address 4 on NIC 1. This packet should + // not be forwarded. + buf := buffer.NewView(30) + buf[dstAddrOffset] = 4 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + + // Inject an inbound packet to address 3 on NIC 1, and see if it is + // forwarded to NIC 2. + buf = buffer.NewView(30) + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + + var p fwdTestPacketInfo + + select { + case p = <-ep2.C: + case <-time.After(time.Second): + t.Fatal("packet not forwarded") + } + + if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 { + t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset]) + } + + // Test that the address resolution happened correctly. + if p.RemoteLinkAddress != "c" { + t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) + } + if p.LocalLinkAddress != "b" { + t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) + } + }) + } +} + +func TestForwardingWithFakeResolverTwoPackets(t *testing.T) { + tests := []struct { + name string + useNeighborCache bool + proto *fwdTestNetworkProtocol + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) { + // Any packets will be resolved to the link address "c". + cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") + }, + }, + }, + { + name: "neighborCache", + useNeighborCache: true, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) { + t.Helper() + if len(remoteLinkAddr) != 0 { + t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr) + } + // Any packets will be resolved to the link address "c". + neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache) + + // Inject two inbound packets to address 3 on NIC 1. + for i := 0; i < 2; i++ { + buf := buffer.NewView(30) + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + } + + for i := 0; i < 2; i++ { + var p fwdTestPacketInfo + + select { + case p = <-ep2.C: + case <-time.After(time.Second): + t.Fatal("packet not forwarded") + } + + if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 { + t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset]) + } + + // Test that the address resolution happened correctly. 
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
+	}
+}
+
+func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
+				// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = 3
+				// Set the packet sequence number.
+				binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
+
+			for i := 0; i < maxPendingPacketsPerResolution; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				b := PayloadSince(p.Pkt.NetworkHeader())
+				if b[dstAddrOffset] != 3 {
+					t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
+				}
+				if len(b) < fwdTestNetHeaderLen+2 {
+					t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", len(b))
+				}
+				seqNumBuf := b[fwdTestNetHeaderLen:]
+
+				// The first 5 packets should not be forwarded, so the sequence number should
+				// start with 5.
+				want := uint16(i + 5)
+				if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
+					t.Fatalf("got the packet #%d, want = #%d", n, want)
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
+	}
+}
+
+func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
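
The ManyPackets test above depends on the per-resolution pending-packet queue keeping only the most recent maxPendingPacketsPerResolution packets, which is why the surviving sequence numbers start at 5 after injecting maxPendingPacketsPerResolution+5. A toy model of that drop-oldest policy (the constant value and queue type here are illustrative, not the stack's implementation):

package main

import "fmt"

const maxPending = 8 // stands in for maxPendingPacketsPerResolution

// enqueue appends seq, dropping the oldest entry when the queue is full, so
// injecting maxPending+5 packets leaves sequence numbers 5..maxPending+4.
func enqueue(q []int, seq int) []int {
	if len(q) == maxPending {
		q = q[1:] // evict the oldest pending packet
	}
	return append(q, seq)
}

func main() {
	var q []int
	for seq := 0; seq < maxPending+5; seq++ {
		q = enqueue(q, seq)
	}
	fmt.Println(q) // [5 6 7 8 9 10 11 12]
}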
+ cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") + }, + }, + }, + { + name: "neighborCache", + useNeighborCache: true, + proto: &fwdTestNetworkProtocol{ + addrResolveDelay: 500 * time.Millisecond, + onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) { + t.Helper() + if len(remoteLinkAddr) != 0 { + t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr) + } + // Any packets will be resolved to the link address "c". + neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache) + + for i := 0; i < maxPendingResolutions+5; i++ { + // Inject inbound 'maxPendingResolutions + 5' packets on NIC 1. + // Each packet has a different destination address (3 to + // maxPendingResolutions + 7). + buf := buffer.NewView(30) + buf[dstAddrOffset] = byte(3 + i) + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + } + + for i := 0; i < maxPendingResolutions; i++ { + var p fwdTestPacketInfo + + select { + case p = <-ep2.C: + case <-time.After(time.Second): + t.Fatal("packet not forwarded") + } + + // The first 5 packets (address 3 to 7) should not be forwarded + // because their address resolutions are interrupted. + if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 { + t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset]) + } + + // Test that the address resolution happened correctly. + if p.RemoteLinkAddress != "c" { + t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress) + } + if p.LocalLinkAddress != "b" { + t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress) + } + } + }) + } +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index c37da814f..8d6d9a7f1 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -57,14 +57,14 @@ const reaperDelay = 5 * time.Second // all packets. 
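
In the DefaultTables literal that follows, each table maps every hook to the index of the first rule of its builtin chain, with HookUnset marking hooks the table does not participate in; Underflows names the fallback rule reached when a chain returns. A reduced sketch of that layout, using simplified stand-ins for the stack's Hook and Table types:

package main

import "fmt"

// The hook constants and HookUnset mirror the values used by the tables
// below; the table maps each hook to the index of its first rule.
const (
	Prerouting = iota
	Input
	Forward
	Output
	Postrouting
	NumHooks

	HookUnset = -1
)

type table struct {
	BuiltinChains [NumHooks]int
	Underflows    [NumHooks]int
}

func main() {
	filter := table{
		BuiltinChains: [NumHooks]int{Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset},
		Underflows:    [NumHooks]int{Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset},
	}
	fmt.Println(filter.BuiltinChains[Input])      // 0: Input traversal starts at rule 0
	fmt.Println(filter.BuiltinChains[Prerouting]) // -1: the filter table has no prerouting chain
}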
func DefaultTables() *IPTables { return &IPTables{ - tables: [numTables]Table{ + v4Tables: [numTables]Table{ natID: Table{ Rules: []Rule{ - Rule{Target: AcceptTarget{}}, - Rule{Target: AcceptTarget{}}, - Rule{Target: AcceptTarget{}}, - Rule{Target: AcceptTarget{}}, - Rule{Target: ErrorTarget{}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, @@ -83,9 +83,9 @@ func DefaultTables() *IPTables { }, mangleID: Table{ Rules: []Rule{ - Rule{Target: AcceptTarget{}}, - Rule{Target: AcceptTarget{}}, - Rule{Target: ErrorTarget{}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, @@ -101,10 +101,75 @@ func DefaultTables() *IPTables { }, filterID: Table{ Rules: []Rule{ - Rule{Target: AcceptTarget{}}, - Rule{Target: AcceptTarget{}}, - Rule{Target: AcceptTarget{}}, - Rule{Target: ErrorTarget{}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, + }, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, + }, + }, + }, + v6Tables: [numTables]Table{ + natID: Table{ + Rules: []Rule{ + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: 0, + Input: 1, + Forward: HookUnset, + Output: 2, + Postrouting: 3, + }, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: 1, + Forward: HookUnset, + Output: 2, + Postrouting: 3, + }, + }, + mangleID: Table{ + Rules: []Rule{ + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: 0, + Output: 1, + }, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: HookUnset, + Forward: HookUnset, + Output: 1, + Postrouting: HookUnset, + }, + }, + filterID: Table{ + Rules: []Rule{ + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, + Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, @@ -165,18 +230,21 @@ func EmptyNATTable() Table { } // 
GetTable returns a table by name. -func (it *IPTables) GetTable(name string) (Table, bool) { +func (it *IPTables) GetTable(name string, ipv6 bool) (Table, bool) { id, ok := nameToID[name] if !ok { return Table{}, false } it.mu.RLock() defer it.mu.RUnlock() - return it.tables[id], true + if ipv6 { + return it.v6Tables[id], true + } + return it.v4Tables[id], true } // ReplaceTable replaces or inserts table by name. -func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error { +func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Error { id, ok := nameToID[name] if !ok { return tcpip.ErrInvalidOptionValue @@ -190,7 +258,11 @@ func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error { it.startReaper(reaperDelay) } it.modified = true - it.tables[id] = table + if ipv6 { + it.v6Tables[id] = table + } else { + it.v4Tables[id] = table + } return nil } @@ -213,8 +285,15 @@ const ( // should continue traversing the network stack and false when it should be // dropped. // +// TODO(gvisor.dev/issue/170): PacketBuffer should hold the GSO and route, from +// which address and nicName can be gathered. Currently, address is only +// needed for prerouting and nicName is only needed for output. +// // Precondition: pkt.NetworkHeader is set. -func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool { +func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) bool { + if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { + return true + } // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. it.mu.RLock() @@ -235,9 +314,14 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr if tableID == natID && pkt.NatDone { continue } - table := it.tables[tableID] + var table Table + if pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber { + table = it.v6Tables[tableID] + } else { + table = it.v4Tables[tableID] + } ruleIdx := table.BuiltinChains[hook] - switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: continue @@ -248,7 +332,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // Any Return from a built-in chain means we have to // call the underflow. underflow := table.Rules[table.Underflows[hook]] - switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v { + switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr); v { case RuleAccept: continue case RuleDrop: @@ -315,8 +399,8 @@ func (it *IPTables) startReaper(interval time.Duration) { // should not go forward. // // Preconditions: -// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. -// - pkt.NetworkHeader is not nil. +// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. +// * pkt.NetworkHeader is not nil. // // NOTE: unlike the Check API the returned map contains packets that should be // dropped. @@ -341,13 +425,13 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r * } // Preconditions: -// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. 
-// - pkt.NetworkHeader is not nil. -func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict { +// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. +// * pkt.NetworkHeader is not nil. +func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { - switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { + switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { case RuleAccept: return chainAccept @@ -364,7 +448,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId ruleIdx++ continue } - switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, nicName); verdict { case chainAccept: return chainAccept case chainDrop: @@ -388,13 +472,13 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId } // Preconditions: -// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. -// - pkt.NetworkHeader is not nil. -func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) { +// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. +// * pkt.NetworkHeader is not nil. +func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // Check whether the packet matches the IP header filter. - if !rule.Filter.match(header.IPv4(pkt.NetworkHeader().View()), hook, nicName) { + if !rule.Filter.match(pkt, hook, nicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } @@ -413,11 +497,16 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx } // All the matchers matched, so run the target. - return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) + return rule.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr) } // OriginalDst returns the original destination of redirected connections. It // returns an error if the connection doesn't exist or isn't redirected. -func (it *IPTables) OriginalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { - return it.connections.originalDst(epID) +func (it *IPTables) OriginalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber) (tcpip.Address, uint16, *tcpip.Error) { + it.mu.RLock() + defer it.mu.RUnlock() + if !it.modified { + return "", 0, tcpip.ErrNotConnected + } + return it.connections.originalDst(epID, netProto) } diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go index 5f1b2af64..538c4625d 100644 --- a/pkg/tcpip/stack/iptables_targets.go +++ b/pkg/tcpip/stack/iptables_targets.go @@ -21,78 +21,139 @@ import ( ) // AcceptTarget accepts packets. -type AcceptTarget struct{} +type AcceptTarget struct { + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. 
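
The targets rewrite below pairs every target with an ID() method returning a TargetID keyed by name and network protocol, alongside the existing Action(). The contract, shrunk to a standalone sketch (the Action signature is heavily simplified relative to the stack's, and the protocol number in main is an arbitrary illustrative value):

package main

import "fmt"

type RuleVerdict int

const (
	RuleAccept RuleVerdict = iota
	RuleDrop
	RuleReturn
)

// TargetID identifies a target by name and the network protocol it was
// registered for, mirroring the ID() method added in this change.
type TargetID struct {
	Name            string
	NetworkProtocol int
}

type Target interface {
	ID() TargetID
	Action() (RuleVerdict, int)
}

// AcceptTarget accepts packets; an unnamed TargetID carrying only the
// protocol mirrors the stack's AcceptTarget.
type AcceptTarget struct{ NetworkProtocol int }

func (at *AcceptTarget) ID() TargetID            { return TargetID{NetworkProtocol: at.NetworkProtocol} }
func (*AcceptTarget) Action() (RuleVerdict, int) { return RuleAccept, 0 }

func main() {
	var t Target = &AcceptTarget{NetworkProtocol: 0x0800}
	v, _ := t.Action()
	fmt.Println(t.ID(), v)
}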
+func (at *AcceptTarget) ID() TargetID { + return TargetID{ + NetworkProtocol: at.NetworkProtocol, + } +} // Action implements Target.Action. -func (AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (*AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { return RuleAccept, 0 } // DropTarget drops packets. -type DropTarget struct{} +type DropTarget struct { + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. +func (dt *DropTarget) ID() TargetID { + return TargetID{ + NetworkProtocol: dt.NetworkProtocol, + } +} // Action implements Target.Action. -func (DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (*DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { return RuleDrop, 0 } +// ErrorTargetName is used to mark targets as error targets. Error targets +// shouldn't be reached - an error has occurred if we fall through to one. +const ErrorTargetName = "ERROR" + // ErrorTarget logs an error and drops the packet. It represents a target that // should be unreachable. -type ErrorTarget struct{} +type ErrorTarget struct { + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. +func (et *ErrorTarget) ID() TargetID { + return TargetID{ + Name: ErrorTargetName, + NetworkProtocol: et.NetworkProtocol, + } +} // Action implements Target.Action. -func (ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (*ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { log.Debugf("ErrorTarget triggered.") return RuleDrop, 0 } // UserChainTarget marks a rule as the beginning of a user chain. type UserChainTarget struct { + // Name is the chain name. Name string + + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. +func (uc *UserChainTarget) ID() TargetID { + return TargetID{ + Name: ErrorTargetName, + NetworkProtocol: uc.NetworkProtocol, + } } // Action implements Target.Action. -func (UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (*UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { panic("UserChainTarget should never be called.") } // ReturnTarget returns from the current chain. If the chain is a built-in, the // hook's underflow should be called. -type ReturnTarget struct{} +type ReturnTarget struct { + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. +func (rt *ReturnTarget) ID() TargetID { + return TargetID{ + NetworkProtocol: rt.NetworkProtocol, + } +} // Action implements Target.Action. -func (ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (*ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { return RuleReturn, 0 } +// RedirectTargetName is used to mark targets as redirect targets. Redirect +// targets should be reached for only NAT and Mangle tables. 
These targets will +// change the destination port/destination IP for packets. +const RedirectTargetName = "REDIRECT" + // RedirectTarget redirects the packet by modifying the destination port/IP. -// Min and Max values for IP and Ports in the struct indicate the range of -// values which can be used to redirect. +// TODO(gvisor.dev/issue/170): Other flags need to be added after we support +// them. type RedirectTarget struct { - // TODO(gvisor.dev/issue/170): Other flags need to be added after - // we support them. - // RangeProtoSpecified flag indicates single port is specified to - // redirect. - RangeProtoSpecified bool + // Addr indicates address used to redirect. + Addr tcpip.Address - // MinIP indicates address used to redirect. - MinIP tcpip.Address + // Port indicates port used to redirect. + Port uint16 - // MaxIP indicates address used to redirect. - MaxIP tcpip.Address - - // MinPort indicates port used to redirect. - MinPort uint16 + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} - // MaxPort indicates port used to redirect. - MaxPort uint16 +// ID implements Target.ID. +func (rt *RedirectTarget) ID() TargetID { + return TargetID{ + Name: RedirectTargetName, + NetworkProtocol: rt.NetworkProtocol, + } } // Action implements Target.Action. // TODO(gvisor.dev/issue/170): Parse headers without copying. The current // implementation only works for PREROUTING and calls pkt.Clone(), neither // of which should be the case. -func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) { +func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) { // Packet is already manipulated. if pkt.NatDone { return RuleAccept, 0 @@ -103,34 +164,35 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso return RuleDrop, 0 } - // Change the address to localhost (127.0.0.1) in Output and - // to primary address of the incoming interface in Prerouting. + // Change the address to localhost (127.0.0.1 or ::1) in Output and to + // the primary address of the incoming interface in Prerouting. switch hook { case Output: - rt.MinIP = tcpip.Address([]byte{127, 0, 0, 1}) - rt.MaxIP = tcpip.Address([]byte{127, 0, 0, 1}) + if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { + rt.Addr = tcpip.Address([]byte{127, 0, 0, 1}) + } else { + rt.Addr = header.IPv6Loopback + } case Prerouting: - rt.MinIP = address - rt.MaxIP = address + rt.Addr = address default: panic("redirect target is supported only on output and prerouting hooks") } // TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if // we need to change dest address (for OUTPUT chain) or ports. - netHeader := header.IPv4(pkt.NetworkHeader().View()) - switch protocol := netHeader.TransportProtocol(); protocol { + switch protocol := pkt.TransportProtocolNumber; protocol { case header.UDPProtocolNumber: udpHeader := header.UDP(pkt.TransportHeader().View()) - udpHeader.SetDestinationPort(rt.MinPort) + udpHeader.SetDestinationPort(rt.Port) // Calculate UDP checksum and set it. if hook == Output { udpHeader.SetChecksum(0) - length := uint16(pkt.Size()) - uint16(netHeader.HeaderLength()) // Only calculate the checksum if offloading isn't supported. 
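// (The UDP checksum covers a pseudo-header built from the source and // destination addresses, the protocol number, and the UDP length, plus the // UDP header and payload, so it must be recomputed once the destination is // rewritten; the pseudo-header sum and payload views are folded in below.)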
if r.Capabilities()&CapabilityTXChecksumOffload == 0 { + length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View())) xsum := r.PseudoHeaderChecksum(protocol, length) for _, v := range pkt.Data.Views() { xsum = header.Checksum(v, xsum) @@ -139,10 +201,15 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum)) } } - // Change destination address. - netHeader.SetDestinationAddress(rt.MinIP) - netHeader.SetChecksum(0) - netHeader.SetChecksum(^netHeader.CalculateChecksum()) + + pkt.Network().SetDestinationAddress(rt.Addr) + + // After modification, IPv4 packets need a valid checksum. + if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { + netHeader := header.IPv4(pkt.NetworkHeader().View()) + netHeader.SetChecksum(0) + netHeader.SetChecksum(^netHeader.CalculateChecksum()) + } pkt.NatDone = true case header.TCPProtocolNumber: if ct == nil { diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index 73274ada9..7b3f3e88b 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -15,6 +15,7 @@ package stack import ( + "fmt" "strings" "sync" @@ -81,31 +82,42 @@ const ( // // +stateify savable type IPTables struct { - // mu protects tables, priorities, and modified. + // mu protects v4Tables, v6Tables, and modified. mu sync.RWMutex - - // tables maps tableIDs to tables. Holds builtin tables only, not user - // tables. mu must be locked for accessing. - tables [numTables]Table - - // priorities maps each hook to a list of table names. The order of the - // list is the order in which each table should be visited for that - // hook. mu needs to be locked for accessing. - priorities [NumHooks][]tableID - + // v4Tables and v6tables map tableIDs to tables. They hold builtin + // tables only, not user tables. mu must be locked for accessing. + v4Tables [numTables]Table + v6Tables [numTables]Table // modified is whether tables have been modified at least once. It is // used to elide the iptables performance overhead for workloads that // don't utilize iptables. modified bool + // priorities maps each hook to a list of table names. The order of the + // list is the order in which each table should be visited for that + // hook. It is immutable. + priorities [NumHooks][]tableID + connections ConnTrack - // reaperDone can be signalled to stop the reaper goroutine. + // reaperDone can be signaled to stop the reaper goroutine. reaperDone chan struct{} } -// A Table defines a set of chains and hooks into the network stack. It is -// really just a list of rules. +// A Table defines a set of chains and hooks into the network stack. +// +// It is a list of Rules, entry points (BuiltinChains), and error handlers +// (Underflows). As packets traverse netstack, they hit hooks. When a packet +// hits a hook, iptables compares it to Rules starting from that hook's entry +// point. So if a packet hits the Input hook, we look up the corresponding +// entry point in BuiltinChains and jump to that point. +// +// If the Rule doesn't match the packet, iptables continues to the next Rule. +// If a Rule does match, it can issue a verdict on the packet (e.g. RuleAccept +// or RuleDrop) that causes the packet to stop traversing iptables. It can also +// jump to other rules or perform custom actions based on Rule.Target. +// +// Underflow Rules are invoked when a chain returns without reaching a verdict. 
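+// +// A rough walk-through (illustrative only): a packet arriving at the Input +// hook is compared against Rules[BuiltinChains[Input]] and then against each +// following Rule until one returns RuleAccept or RuleDrop; if the built-in +// chain ends without a verdict, Rules[Underflows[Input]] decides.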
// // +stateify savable type Table struct { @@ -148,13 +160,18 @@ type Rule struct { Target Target } -// IPHeaderFilter holds basic IP filtering data common to every rule. +// IPHeaderFilter performs basic IP header matching common to every rule. // // +stateify savable type IPHeaderFilter struct { // Protocol matches the transport protocol. Protocol tcpip.TransportProtocolNumber + // CheckProtocol determines whether the Protocol field should be + // checked during matching. + // TODO(gvisor.dev/issue/3549): Check this field during matching. + CheckProtocol bool + // Dst matches the destination IP address. Dst tcpip.Address @@ -191,16 +208,43 @@ type IPHeaderFilter struct { OutputInterfaceInvert bool } -// match returns whether hdr matches the filter. -func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool { - // TODO(gvisor.dev/issue/170): Support other fields of the filter. +// match returns whether pkt matches the filter. +// +// Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4 +// or IPv6 header length. +func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) bool { + // Extract header fields. + var ( + // TODO(gvisor.dev/issue/170): Support other filter fields. + transProto tcpip.TransportProtocolNumber + dstAddr tcpip.Address + srcAddr tcpip.Address + ) + switch proto := pkt.NetworkProtocolNumber; proto { + case header.IPv4ProtocolNumber: + hdr := header.IPv4(pkt.NetworkHeader().View()) + transProto = hdr.TransportProtocol() + dstAddr = hdr.DestinationAddress() + srcAddr = hdr.SourceAddress() + + case header.IPv6ProtocolNumber: + hdr := header.IPv6(pkt.NetworkHeader().View()) + transProto = hdr.TransportProtocol() + dstAddr = hdr.DestinationAddress() + srcAddr = hdr.SourceAddress() + + default: + panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto)) + } + // Check the transport protocol. - if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() { + if fl.CheckProtocol && fl.Protocol != transProto { return false } - // Check the source and destination IPs. - if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) { + // Check the addresses. + if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) || + !filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) { return false } @@ -228,6 +272,18 @@ func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool return true } +// NetworkProtocol returns the protocol (IPv4 or IPv6) to which the header +// applies. +func (fl IPHeaderFilter) NetworkProtocol() tcpip.NetworkProtocolNumber { + switch len(fl.Src) { + case header.IPv4AddressSize: + return header.IPv4ProtocolNumber + case header.IPv6AddressSize: + return header.IPv6ProtocolNumber + } + panic(fmt.Sprintf("invalid address in IPHeaderFilter: %s", fl.Src)) +} + // filterAddress returns whether addr matches the filter. func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool { matches := true @@ -253,8 +309,23 @@ type Matcher interface { Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool) } +// A TargetID uniquely identifies a target. +type TargetID struct { + // Name is the target name as stored in the xt_entry_target struct. + Name string + + // NetworkProtocol is the protocol to which the target applies. + NetworkProtocol tcpip.NetworkProtocolNumber + + // Revision is the version of the target. 
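+ // (Background note, not part of this change: Linux versions its iptables + // target layouts by revision, and userspace probes for the highest + // revision the kernel understands.)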
+ Revision uint8 +} + // A Target is the interface for taking an action for a packet. type Target interface { + // ID uniquely identifies the Target. + ID() TargetID + // Action takes an action on the packet and returns a verdict on how // traversal should (or should not) continue. If the return value is // Jump, it also returns the index of the rule to jump to. diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index b15b8d1cb..33806340e 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -16,6 +16,7 @@ package stack import ( "fmt" + "math" "sync/atomic" "testing" "time" @@ -191,7 +192,13 @@ func TestCacheReplace(t *testing.T) { } func TestCacheResolution(t *testing.T) { - c := newLinkAddrCache(1<<63-1, 250*time.Millisecond, 1) + // There is a race condition causing this test to fail when the executor + // takes longer than the resolution timeout to call linkAddrCache.get. This + // is especially common when this test is run with gotsan. + // + // Using a large resolution timeout decreases the probability of experiencing + // this race condition and does not affect how long this test takes to run. + c := newLinkAddrCache(1<<63-1, math.MaxInt64, 1) linkRes := &testLinkAddressResolver{cache: c} for i, ta := range testAddrs { got, err := getBlocking(c, ta.addr, linkRes) @@ -275,3 +282,71 @@ func TestStaticResolution(t *testing.T) { t.Errorf("c.get(%q)=%q, want %q", string(addr), string(got), string(want)) } } + +// TestCacheWaker verifies that RemoveWaker removes a waker previously added +// through get(). +func TestCacheWaker(t *testing.T) { + c := newLinkAddrCache(1<<63-1, 1*time.Second, 3) + + // First, sanity check that wakers are working. + { + linkRes := &testLinkAddressResolver{cache: c} + s := sleep.Sleeper{} + defer s.Done() + + const wakerID = 1 + w := sleep.Waker{} + s.AddWaker(&w, wakerID) + + e := testAddrs[0] + + if _, _, err := c.get(e.addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock { + t.Fatalf("got c.get(%q, _, _, _, _) = %s, want = %s", e.addr.Addr, err, tcpip.ErrWouldBlock) + } + id, ok := s.Fetch(true /* block */) + if !ok { + t.Fatal("got s.Fetch(true) = (_, false), want = (_, true)") + } + if id != wakerID { + t.Fatalf("got s.Fetch(true) = (%d, %t), want = (%d, true)", id, ok, wakerID) + } + + if got, _, err := c.get(e.addr, linkRes, "", nil, nil); err != nil { + t.Fatalf("c.get(%q, _, _, _, _): %s", e.addr.Addr, err) + } else if got != e.linkAddr { + t.Fatalf("got c.get(%q) = %q, want = %q", e.addr.Addr, got, e.linkAddr) + } + } + + // Check that RemoveWaker works. + { + linkRes := &testLinkAddressResolver{cache: c} + s := sleep.Sleeper{} + defer s.Done() + + const wakerID = 2 // different than the ID used in the sanity check + w := sleep.Waker{} + s.AddWaker(&w, wakerID) + + e := testAddrs[1] + linkRes.onLinkAddressRequest = func() { + // Remove the waker before the linkAddrCache has the opportunity to send + // a notification. 
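+ // (The removal below deliberately races with the cache's pending + // notification; the Fetch(false) check at the end verifies the waker was + // never woken.)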
+ c.removeWaker(e.addr, &w) + } + + if _, _, err := c.get(e.addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock { + t.Fatalf("got c.get(%q, _, _, _, _) = %s, want = %s", e.addr.Addr, err, tcpip.ErrWouldBlock) + } + + if got, err := getBlocking(c, e.addr, linkRes); err != nil { + t.Fatalf("c.get(%q, _, _, _, _): %s", e.addr.Addr, err) + } else if got != e.linkAddr { + t.Fatalf("c.get(%q) = %q, want = %q", e.addr.Addr, got, e.linkAddr) + } + + if id, ok := s.Fetch(false /* block */); ok { + t.Fatalf("unexpected notification from waker with id %d", id) + } + } +} diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 21bf53010..73a01c2dd 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -150,10 +150,10 @@ type ndpDNSSLEvent struct { type ndpDHCPv6Event struct { nicID tcpip.NICID - configuration stack.DHCPv6ConfigurationFromNDPRA + configuration ipv6.DHCPv6ConfigurationFromNDPRA } -var _ stack.NDPDispatcher = (*ndpDispatcher)(nil) +var _ ipv6.NDPDispatcher = (*ndpDispatcher)(nil) // ndpDispatcher implements NDPDispatcher so tests can know when various NDP // related events happen for test purposes. @@ -170,7 +170,7 @@ type ndpDispatcher struct { dhcpv6ConfigurationC chan ndpDHCPv6Event } -// Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus. +// Implements ipv6.NDPDispatcher.OnDuplicateAddressDetectionStatus. func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) { if n.dadC != nil { n.dadC <- ndpDADEvent{ @@ -182,7 +182,7 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, add } } -// Implements stack.NDPDispatcher.OnDefaultRouterDiscovered. +// Implements ipv6.NDPDispatcher.OnDefaultRouterDiscovered. func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool { if c := n.routerC; c != nil { c <- ndpRouterEvent{ @@ -195,7 +195,7 @@ func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip. return n.rememberRouter } -// Implements stack.NDPDispatcher.OnDefaultRouterInvalidated. +// Implements ipv6.NDPDispatcher.OnDefaultRouterInvalidated. func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) { if c := n.routerC; c != nil { c <- ndpRouterEvent{ @@ -206,7 +206,7 @@ func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip } } -// Implements stack.NDPDispatcher.OnOnLinkPrefixDiscovered. +// Implements ipv6.NDPDispatcher.OnOnLinkPrefixDiscovered. func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool { if c := n.prefixC; c != nil { c <- ndpPrefixEvent{ @@ -219,7 +219,7 @@ func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip return n.rememberPrefix } -// Implements stack.NDPDispatcher.OnOnLinkPrefixInvalidated. +// Implements ipv6.NDPDispatcher.OnOnLinkPrefixInvalidated. func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) { if c := n.prefixC; c != nil { c <- ndpPrefixEvent{ @@ -261,7 +261,7 @@ func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpi } } -// Implements stack.NDPDispatcher.OnRecursiveDNSServerOption. +// Implements ipv6.NDPDispatcher.OnRecursiveDNSServerOption. 
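+// (RDNSS options advertise recursive DNS server addresses with an associated +// lifetime; see RFC 8106.)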
func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) { if c := n.rdnssC; c != nil { c <- ndpRDNSSEvent{ @@ -274,7 +274,7 @@ func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tc } } -// Implements stack.NDPDispatcher.OnDNSSearchListOption. +// Implements ipv6.NDPDispatcher.OnDNSSearchListOption. func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration) { if n.dnsslC != nil { n.dnsslC <- ndpDNSSLEvent{ @@ -285,8 +285,8 @@ func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []s } } -// Implements stack.NDPDispatcher.OnDHCPv6Configuration. -func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) { +// Implements ipv6.NDPDispatcher.OnDHCPv6Configuration. +func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration ipv6.DHCPv6ConfigurationFromNDPRA) { if c := n.dhcpv6ConfigurationC; c != nil { c <- ndpDHCPv6Event{ nicID, @@ -319,13 +319,12 @@ func TestDADDisabled(t *testing.T) { ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent, 1), } - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPDisp: &ndpDisp, - } - e := channel.New(0, 1280, linkAddr1) - s := stack.New(opts) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPDisp: &ndpDisp, + })}, + }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } @@ -413,19 +412,21 @@ func TestDADResolve(t *testing.T) { ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent), } - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPDisp: &ndpDisp, - } - opts.NDPConfigs.RetransmitTimer = test.retransTimer - opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits e := channelLinkWithHeaderLength{ Endpoint: channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1), headerLength: test.linkHeaderLen, } e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired - s := stack.New(opts) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPDisp: &ndpDisp, + NDPConfigs: ipv6.NDPConfigurations{ + RetransmitTimer: test.retransTimer, + DupAddrDetectTransmits: test.dupAddrDetectTransmits, + }, + })}, + }) if err := s.CreateNIC(nicID, &e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } @@ -558,6 +559,26 @@ func TestDADResolve(t *testing.T) { } } +func rxNDPSolicit(e *channel.Endpoint, tgt tcpip.Address) { + hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize)) + pkt.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns.SetTargetAddress(tgt) + snmc := header.SolicitedNodeAddr(tgt) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{})) + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(icmp.ProtocolNumber6), + HopLimit: 255, + SrcAddr: header.IPv6Any, + DstAddr: snmc, + }) + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{Data: 
hdr.View().ToVectorisedView()})) +} + // TestDADFail tests to make sure that the DAD process fails if another node is // detected to be performing DAD on the same address (receive an NS message from // a node doing DAD for the same address), or if another node is detected to own @@ -567,39 +588,19 @@ func TestDADFail(t *testing.T) { tests := []struct { name string - makeBuf func(tgt tcpip.Address) buffer.Prependable + rxPkt func(e *channel.Endpoint, tgt tcpip.Address) getStat func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter }{ { - "RxSolicit", - func(tgt tcpip.Address) buffer.Prependable { - hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize)) - pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) - ns.SetTargetAddress(tgt) - snmc := header.SolicitedNodeAddr(tgt) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{})) - payloadLength := hdr.UsedLength() - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: 255, - SrcAddr: header.IPv6Any, - DstAddr: snmc, - }) - - return hdr - - }, - func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + name: "RxSolicit", + rxPkt: rxNDPSolicit, + getStat: func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return s.NeighborSolicit }, }, { - "RxAdvert", - func(tgt tcpip.Address) buffer.Prependable { + name: "RxAdvert", + rxPkt: func(e *channel.Endpoint, tgt tcpip.Address) { naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize) pkt := header.ICMPv6(hdr.Prepend(naSize)) @@ -621,11 +622,9 @@ func TestDADFail(t *testing.T) { SrcAddr: tgt, DstAddr: header.IPv6AllNodesMulticastAddress, }) - - return hdr - + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{Data: hdr.View().ToVectorisedView()})) }, - func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { + getStat: func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter { return s.NeighborAdvert }, }, @@ -636,16 +635,16 @@ func TestDADFail(t *testing.T) { ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent, 1), } - ndpConfigs := stack.DefaultNDPConfigurations() - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: ndpConfigs, - NDPDisp: &ndpDisp, - } - opts.NDPConfigs.RetransmitTimer = time.Second * 2 + ndpConfigs := ipv6.DefaultNDPConfigurations() + ndpConfigs.RetransmitTimer = time.Second * 2 e := channel.New(0, 1280, linkAddr1) - s := stack.New(opts) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPDisp: &ndpDisp, + NDPConfigs: ndpConfigs, + })}, + }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } @@ -664,13 +663,8 @@ func TestDADFail(t *testing.T) { t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } - // Receive a packet to simulate multiple nodes owning or - // attempting to own the same address. 
- hdr := test.makeBuf(addr1) - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: hdr.View().ToVectorisedView(), - }) - e.InjectInbound(header.IPv6ProtocolNumber, pkt) + // Receive a packet to simulate an address conflict. + test.rxPkt(e, addr1) stat := test.getStat(s.Stats().ICMP.V6PacketsReceived) if got := stat.Value(); got != 1 { @@ -754,18 +748,19 @@ func TestDADStop(t *testing.T) { ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent, 1), } - ndpConfigs := stack.NDPConfigurations{ + + ndpConfigs := ipv6.NDPConfigurations{ RetransmitTimer: time.Second, DupAddrDetectTransmits: 2, } - opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPDisp: &ndpDisp, - NDPConfigs: ndpConfigs, - } e := channel.New(0, 1280, linkAddr1) - s := stack.New(opts) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPDisp: &ndpDisp, + NDPConfigs: ndpConfigs, + })}, + }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } @@ -815,19 +810,6 @@ func TestDADStop(t *testing.T) { } } -// TestSetNDPConfigurationFailsForBadNICID tests to make sure we get an error if -// we attempt to update NDP configurations using an invalid NICID. -func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) { - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - }) - - // No NIC with ID 1 yet. - if got := s.SetNDPConfigurations(1, stack.NDPConfigurations{}); got != tcpip.ErrUnknownNICID { - t.Fatalf("got s.SetNDPConfigurations = %v, want = %s", got, tcpip.ErrUnknownNICID) - } -} - // TestSetNDPConfigurations tests that we can update and use per-interface NDP // configurations without affecting the default NDP configurations or other // interfaces' configurations. @@ -863,8 +845,9 @@ func TestSetNDPConfigurations(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPDisp: &ndpDisp, + })}, }) expectDADEvent := func(nicID tcpip.NICID, addr tcpip.Address) { @@ -892,12 +875,15 @@ func TestSetNDPConfigurations(t *testing.T) { } // Update the NDP configurations on NIC(1) to use DAD. 
- configs := stack.NDPConfigurations{ + configs := ipv6.NDPConfigurations{ DupAddrDetectTransmits: test.dupAddrDetectTransmits, RetransmitTimer: test.retransmitTimer, } - if err := s.SetNDPConfigurations(nicID1, configs); err != nil { - t.Fatalf("got SetNDPConfigurations(%d, _) = %s", nicID1, err) + if ipv6Ep, err := s.GetNetworkEndpoint(nicID1, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID1, header.IPv6ProtocolNumber, err) + } else { + ndpEP := ipv6Ep.(ipv6.NDPEndpoint) + ndpEP.SetNDPConfigurations(configs) } // Created after updating NIC(1)'s NDP configurations @@ -1113,14 +1099,15 @@ func TestNoRouterDiscovery(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: handle, - DiscoverDefaultRouters: discover, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: handle, + DiscoverDefaultRouters: discover, + }, + NDPDisp: &ndpDisp, + })}, }) - s.SetForwarding(forwarding) + s.SetForwarding(ipv6.ProtocolNumber, forwarding) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(1) = %s", err) @@ -1151,12 +1138,13 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverDefaultRouters: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -1192,12 +1180,13 @@ func TestRouterDiscovery(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverDefaultRouters: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: true, + }, + NDPDisp: &ndpDisp, + })}, }) expectRouterEvent := func(addr tcpip.Address, discovered bool) { @@ -1285,7 +1274,7 @@ func TestRouterDiscovery(t *testing.T) { } // TestRouterDiscoveryMaxRouters tests that only -// stack.MaxDiscoveredDefaultRouters discovered routers are remembered. +// ipv6.MaxDiscoveredDefaultRouters discovered routers are remembered. 
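+// Routers advertised beyond that limit should produce no further discovery +// events, which is what the loop below asserts.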
func TestRouterDiscoveryMaxRouters(t *testing.T) { ndpDisp := ndpDispatcher{ routerC: make(chan ndpRouterEvent, 1), @@ -1293,12 +1282,13 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverDefaultRouters: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -1306,14 +1296,14 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { } // Receive an RA from 2 more than the max number of discovered routers. - for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ { + for i := 1; i <= ipv6.MaxDiscoveredDefaultRouters+2; i++ { linkAddr := []byte{2, 2, 3, 4, 5, 0} linkAddr[5] = byte(i) llAddr := header.LinkLocalAddr(tcpip.LinkAddress(linkAddr)) e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5)) - if i <= stack.MaxDiscoveredDefaultRouters { + if i <= ipv6.MaxDiscoveredDefaultRouters { select { case e := <-ndpDisp.routerC: if diff := checkRouterEvent(e, llAddr, true); diff != "" { @@ -1358,14 +1348,15 @@ func TestNoPrefixDiscovery(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: handle, - DiscoverOnLinkPrefixes: discover, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: handle, + DiscoverOnLinkPrefixes: discover, + }, + NDPDisp: &ndpDisp, + })}, }) - s.SetForwarding(forwarding) + s.SetForwarding(ipv6.ProtocolNumber, forwarding) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(1) = %s", err) @@ -1399,13 +1390,14 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverDefaultRouters: false, - DiscoverOnLinkPrefixes: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: false, + DiscoverOnLinkPrefixes: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -1445,12 +1437,13 @@ func TestPrefixDiscovery(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverOnLinkPrefixes: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverOnLinkPrefixes: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -1545,12 +1538,13 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: 
true, - DiscoverOnLinkPrefixes: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverOnLinkPrefixes: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -1621,33 +1615,34 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { } // TestPrefixDiscoveryMaxRouters tests that only -// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered. +// ipv6.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered. func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3), + prefixC: make(chan ndpPrefixEvent, ipv6.MaxDiscoveredOnLinkPrefixes+3), rememberPrefix: true, } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverDefaultRouters: false, - DiscoverOnLinkPrefixes: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: false, + DiscoverOnLinkPrefixes: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(1) = %s", err) } - optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2) - prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{} + optSer := make(header.NDPOptionsSerializer, ipv6.MaxDiscoveredOnLinkPrefixes+2) + prefixes := [ipv6.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{} // Receive an RA with 2 more than the max number of discovered on-link // prefixes. - for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ { + for i := 0; i < ipv6.MaxDiscoveredOnLinkPrefixes+2; i++ { prefixAddr := [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0} prefixAddr[7] = byte(i) prefix := tcpip.AddressWithPrefix{ @@ -1665,8 +1660,8 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { } e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer)) - for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ { - if i < stack.MaxDiscoveredOnLinkPrefixes { + for i := 0; i < ipv6.MaxDiscoveredOnLinkPrefixes+2; i++ { + if i < ipv6.MaxDiscoveredOnLinkPrefixes { select { case e := <-ndpDisp.prefixC: if diff := checkPrefixEvent(e, prefixes[i], true); diff != "" { @@ -1716,14 +1711,15 @@ func TestNoAutoGenAddr(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: handle, - AutoGenGlobalAddresses: autogen, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: handle, + AutoGenGlobalAddresses: autogen, + }, + NDPDisp: &ndpDisp, + })}, }) - s.SetForwarding(forwarding) + s.SetForwarding(ipv6.ProtocolNumber, forwarding) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(1) = %s", err) @@ -1749,14 +1745,14 @@ func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix, // TestAutoGenAddr tests that an address is properly generated and invalidated // when configured to do so. 
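// (SLAAC background: the address is formed from the advertised prefix plus an // interface identifier, and is invalidated once the prefix's valid lifetime // expires; see RFC 4862.)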
-func TestAutoGenAddr(t *testing.T) { +func TestAutoGenAddr2(t *testing.T) { const newMinVL = 2 newMinVLDuration := newMinVL * time.Second - saved := stack.MinPrefixInformationValidLifetimeForUpdate + saved := ipv6.MinPrefixInformationValidLifetimeForUpdate defer func() { - stack.MinPrefixInformationValidLifetimeForUpdate = saved + ipv6.MinPrefixInformationValidLifetimeForUpdate = saved }() - stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration + ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) @@ -1766,12 +1762,13 @@ func TestAutoGenAddr(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -1876,14 +1873,14 @@ func TestAutoGenTempAddr(t *testing.T) { newMinVLDuration = newMinVL * time.Second ) - savedMinPrefixInformationValidLifetimeForUpdate := stack.MinPrefixInformationValidLifetimeForUpdate - savedMaxDesync := stack.MaxDesyncFactor + savedMinPrefixInformationValidLifetimeForUpdate := ipv6.MinPrefixInformationValidLifetimeForUpdate + savedMaxDesync := ipv6.MaxDesyncFactor defer func() { - stack.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate - stack.MaxDesyncFactor = savedMaxDesync + ipv6.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate + ipv6.MaxDesyncFactor = savedMaxDesync }() - stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration - stack.MaxDesyncFactor = time.Nanosecond + ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration + ipv6.MaxDesyncFactor = time.Nanosecond prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) @@ -1931,16 +1928,17 @@ func TestAutoGenTempAddr(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - DupAddrDetectTransmits: test.dupAddrTransmits, - RetransmitTimer: test.retransmitTimer, - HandleRAs: true, - AutoGenGlobalAddresses: true, - AutoGenTempGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - TempIIDSeed: seed, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + DupAddrDetectTransmits: test.dupAddrTransmits, + RetransmitTimer: test.retransmitTimer, + HandleRAs: true, + AutoGenGlobalAddresses: true, + AutoGenTempGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + TempIIDSeed: seed, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2119,11 +2117,11 @@ func TestAutoGenTempAddr(t *testing.T) { func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { const nicID = 1 - savedMaxDesyncFactor := stack.MaxDesyncFactor + savedMaxDesyncFactor := ipv6.MaxDesyncFactor defer func() { - stack.MaxDesyncFactor = savedMaxDesyncFactor + ipv6.MaxDesyncFactor = savedMaxDesyncFactor }() - stack.MaxDesyncFactor = time.Nanosecond + ipv6.MaxDesyncFactor = time.Nanosecond tests := []struct { name 
string @@ -2160,12 +2158,13 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - AutoGenTempGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - AutoGenIPv6LinkLocal: true, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + AutoGenTempGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + AutoGenIPv6LinkLocal: true, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2211,11 +2210,11 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { retransmitTimer = 2 * time.Second ) - savedMaxDesyncFactor := stack.MaxDesyncFactor + savedMaxDesyncFactor := ipv6.MaxDesyncFactor defer func() { - stack.MaxDesyncFactor = savedMaxDesyncFactor + ipv6.MaxDesyncFactor = savedMaxDesyncFactor }() - stack.MaxDesyncFactor = 0 + ipv6.MaxDesyncFactor = 0 prefix, _, addr := prefixSubnetAddr(0, linkAddr1) var tempIIDHistory [header.IIDSize]byte @@ -2228,15 +2227,16 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - DupAddrDetectTransmits: dadTransmits, - RetransmitTimer: retransmitTimer, - HandleRAs: true, - AutoGenGlobalAddresses: true, - AutoGenTempGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + DupAddrDetectTransmits: dadTransmits, + RetransmitTimer: retransmitTimer, + HandleRAs: true, + AutoGenGlobalAddresses: true, + AutoGenTempGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2294,17 +2294,17 @@ func TestAutoGenTempAddrRegen(t *testing.T) { newMinVLDuration = newMinVL * time.Second ) - savedMaxDesyncFactor := stack.MaxDesyncFactor - savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime - savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime + savedMaxDesyncFactor := ipv6.MaxDesyncFactor + savedMinMaxTempAddrPreferredLifetime := ipv6.MinMaxTempAddrPreferredLifetime + savedMinMaxTempAddrValidLifetime := ipv6.MinMaxTempAddrValidLifetime defer func() { - stack.MaxDesyncFactor = savedMaxDesyncFactor - stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime - stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime + ipv6.MaxDesyncFactor = savedMaxDesyncFactor + ipv6.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime + ipv6.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime }() - stack.MaxDesyncFactor = 0 - stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration - stack.MinMaxTempAddrValidLifetime = newMinVLDuration + ipv6.MaxDesyncFactor = 0 + ipv6.MinMaxTempAddrPreferredLifetime = newMinVLDuration + ipv6.MinMaxTempAddrValidLifetime = newMinVLDuration prefix, _, addr := prefixSubnetAddr(0, linkAddr1) var tempIIDHistory [header.IIDSize]byte @@ -2317,16 +2317,17 @@ func TestAutoGenTempAddrRegen(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), } e := channel.New(0, 1280, linkAddr1) - ndpConfigs := stack.NDPConfigurations{ + ndpConfigs := ipv6.NDPConfigurations{ HandleRAs: true, AutoGenGlobalAddresses: true, AutoGenTempGlobalAddresses: true, 
RegenAdvanceDuration: newMinVLDuration - regenAfter, } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: ndpConfigs, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ndpConfigs, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2382,8 +2383,11 @@ // Stop generating temporary addresses ndpConfigs.AutoGenTempGlobalAddresses = false - if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil { - t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err) + if ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else { + ndpEP := ipv6Ep.(ipv6.NDPEndpoint) + ndpEP.SetNDPConfigurations(ndpConfigs) } // Wait for all the temporary addresses to get invalidated. @@ -2439,17 +2443,17 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { newMinVLDuration = newMinVL * time.Second ) - savedMaxDesyncFactor := stack.MaxDesyncFactor - savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime - savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime + savedMaxDesyncFactor := ipv6.MaxDesyncFactor + savedMinMaxTempAddrPreferredLifetime := ipv6.MinMaxTempAddrPreferredLifetime + savedMinMaxTempAddrValidLifetime := ipv6.MinMaxTempAddrValidLifetime defer func() { - stack.MaxDesyncFactor = savedMaxDesyncFactor - stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime - stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime + ipv6.MaxDesyncFactor = savedMaxDesyncFactor + ipv6.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime + ipv6.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime }() - stack.MaxDesyncFactor = 0 - stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration - stack.MinMaxTempAddrValidLifetime = newMinVLDuration + ipv6.MaxDesyncFactor = 0 + ipv6.MinMaxTempAddrPreferredLifetime = newMinVLDuration + ipv6.MinMaxTempAddrValidLifetime = newMinVLDuration prefix, _, addr := prefixSubnetAddr(0, linkAddr1) var tempIIDHistory [header.IIDSize]byte @@ -2462,16 +2466,17 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), } e := channel.New(0, 1280, linkAddr1) - ndpConfigs := stack.NDPConfigurations{ + ndpConfigs := ipv6.NDPConfigurations{ HandleRAs: true, AutoGenGlobalAddresses: true, AutoGenTempGlobalAddresses: true, RegenAdvanceDuration: newMinVLDuration - regenAfter, } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: ndpConfigs, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ndpConfigs, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2545,9 +2550,12 @@ // as passed. 
ndpConfigs.MaxTempAddrValidLifetime = 100 * time.Second ndpConfigs.MaxTempAddrPreferredLifetime = 100 * time.Second - if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil { - t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err) + ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber) + if err != nil { + t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) } + ndpEP := ipv6Ep.(ipv6.NDPEndpoint) + ndpEP.SetNDPConfigurations(ndpConfigs) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) select { case e := <-ndpDisp.autoGenAddrC: @@ -2565,9 +2573,7 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout ndpConfigs.MaxTempAddrValidLifetime = newLifetimes ndpConfigs.MaxTempAddrPreferredLifetime = newLifetimes - if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil { - t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err) - } + ndpEP.SetNDPConfigurations(ndpConfigs) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) } @@ -2655,20 +2661,21 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), } e := channel.New(0, 1280, linkAddr1) - ndpConfigs := stack.NDPConfigurations{ + ndpConfigs := ipv6.NDPConfigurations{ HandleRAs: true, AutoGenGlobalAddresses: true, AutoGenTempGlobalAddresses: test.tempAddrs, AutoGenAddressConflictRetries: 1, } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, - NDPConfigs: ndpConfigs, - NDPDisp: &ndpDisp, - OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ - NICNameFromID: test.nicNameFromID, - }, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ndpConfigs, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: test.nicNameFromID, + }, + })}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) s.SetRouteTable([]tcpip.Route{{ @@ -2739,8 +2746,11 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { ndpDisp.dadC = make(chan ndpDADEvent, 2) ndpConfigs.DupAddrDetectTransmits = dupAddrTransmits ndpConfigs.RetransmitTimer = retransmitTimer - if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil { - t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err) + if ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else { + ndpEP := ipv6Ep.(ipv6.NDPEndpoint) + ndpEP.SetNDPConfigurations(ndpConfigs) } // Do SLAAC for prefix. @@ -2754,9 +2764,7 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { // DAD failure to restart the local generation process. 
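// (The conflict below is simulated with rxNDPSolicit, which injects a // neighbor solicitation from the unspecified address; under DAD rules that // indicates another node probing the same address.)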
addr := test.addrs[maxSLAACAddrLocalRegenAttempts-1] expectAutoGenAddrAsyncEvent(addr, newAddr) - if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil { - t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err) - } + rxNDPSolicit(e, addr.Address) select { case e := <-ndpDisp.dadC: if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" { @@ -2787,20 +2795,22 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { // stack.Stack will have a default route through the router (llAddr3) installed // and a static link-address (linkAddr3) added to the link address cache for the // router. -func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) { +func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID, useNeighborCache bool) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) { t.Helper() ndpDisp := &ndpDispatcher{ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: ndpDisp, + })}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + UseNeighborCache: useNeighborCache, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -2810,7 +2820,11 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd Gateway: llAddr3, NIC: nicID, }}) - s.AddLinkAddress(nicID, llAddr3, linkAddr3) + if useNeighborCache { + s.AddStaticNeighbor(nicID, llAddr3, linkAddr3) + } else { + s.AddLinkAddress(nicID, llAddr3, linkAddr3) + } return ndpDisp, e, s } @@ -2884,110 +2898,128 @@ func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullA // TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when // receiving a PI with 0 preferred lifetime. 
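// (A deprecated address remains assigned and usable by existing flows, but is // no longer preferred when choosing a source address for new connections; see // RFC 4862 section 5.5.4.)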
func TestAutoGenAddrDeprecateFromPI(t *testing.T) { - const nicID = 1 + stacks := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } - prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) - prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) + for _, stackTyp := range stacks { + t.Run(stackTyp.name, func(t *testing.T) { + const nicID = 1 - ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) - expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { - t.Helper() + ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache) - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } } - default: - t.Fatal("expected addr auto gen event") - } - } - expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { - t.Helper() + expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { + t.Helper() - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if got != addr { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) - } + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if got != addr { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + } - if got := addrForNewConnection(t, s); got != addr.Address { - t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) - } - } + if got := addrForNewConnection(t, s); got != addr.Address { + t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) + } + } - // Receive PI for prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) - expectAutoGenAddrEvent(addr1, newAddr) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should have %s in the list of addresses", addr1) - } - expectPrimaryAddr(addr1) + // Receive PI for prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) + expectAutoGenAddrEvent(addr1, newAddr) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + expectPrimaryAddr(addr1) - // Deprecate addr for prefix1 immedaitely. 
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) - expectAutoGenAddrEvent(addr1, deprecatedAddr) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should have %s in the list of addresses", addr1) - } - // addr should still be the primary endpoint as there are no other addresses. - expectPrimaryAddr(addr1) + // Deprecate addr for prefix1 immediately. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) + expectAutoGenAddrEvent(addr1, deprecatedAddr) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + // addr should still be the primary endpoint as there are no other addresses. + expectPrimaryAddr(addr1) - // Refresh lifetimes of addr generated from prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: - } - expectPrimaryAddr(addr1) + // Refresh lifetimes of addr generated from prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr1) - // Receive PI for prefix2. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) - expectAutoGenAddrEvent(addr2, newAddr) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - expectPrimaryAddr(addr2) + // Receive PI for prefix2. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + expectAutoGenAddrEvent(addr2, newAddr) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr2) - // Deprecate addr for prefix2 immedaitely. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) - expectAutoGenAddrEvent(addr2, deprecatedAddr) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - // addr1 should be the primary endpoint now since addr2 is deprecated but - // addr1 is not. - expectPrimaryAddr(addr1) - // addr2 is deprecated but if explicitly requested, it should be used. - fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID} - if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address) - } + // Deprecate addr for prefix2 immediately. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + expectAutoGenAddrEvent(addr2, deprecatedAddr) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + // addr1 should be the primary endpoint now since addr2 is deprecated but + // addr1 is not. + expectPrimaryAddr(addr1) + // addr2 is deprecated but if explicitly requested, it should be used.
+ fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID} + if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address) + } - // Another PI w/ 0 preferred lifetime should not result in a deprecation - // event. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: - } - expectPrimaryAddr(addr1) - if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address) - } + // Another PI w/ 0 preferred lifetime should not result in a deprecation + // event. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr1) + if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address) + } - // Refresh lifetimes of addr generated from prefix2. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: + // Refresh lifetimes of addr generated from prefix2. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr2) + }) } - expectPrimaryAddr(addr2) } // TestAutoGenAddrJobDeprecation tests that an address is properly deprecated @@ -2996,217 +3028,236 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { const nicID = 1 const newMinVL = 2 newMinVLDuration := newMinVL * time.Second - saved := stack.MinPrefixInformationValidLifetimeForUpdate - defer func() { - stack.MinPrefixInformationValidLifetimeForUpdate = saved - }() - stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration - prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) - prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) + stacks := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, + } - ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + for _, stackTyp := range stacks { + t.Run(stackTyp.name, func(t *testing.T) { + saved := ipv6.MinPrefixInformationValidLifetimeForUpdate + defer func() { + ipv6.MinPrefixInformationValidLifetimeForUpdate = saved + }() + ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration - expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { - t.Helper() + prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache) + + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType 
ndpAutoGenAddrEventType) { + t.Helper() + + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } } - default: - t.Fatal("expected addr auto gen event") - } - } - expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) { - t.Helper() + expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) { + t.Helper() - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + case <-time.After(timeout): + t.Fatal("timed out waiting for addr auto gen event") + } } - case <-time.After(timeout): - t.Fatal("timed out waiting for addr auto gen event") - } - } - expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { - t.Helper() + expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { + t.Helper() - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if got != addr { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) - } + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if got != addr { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + } - if got := addrForNewConnection(t, s); got != addr.Address { - t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) - } - } + if got := addrForNewConnection(t, s); got != addr.Address { + t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) + } + } - // Receive PI for prefix2. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) - expectAutoGenAddrEvent(addr2, newAddr) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - expectPrimaryAddr(addr2) + // Receive PI for prefix2. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + expectAutoGenAddrEvent(addr2, newAddr) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr2) - // Receive a PI for prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90)) - expectAutoGenAddrEvent(addr1, newAddr) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should have %s in the list of addresses", addr1) - } - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - expectPrimaryAddr(addr1) + // Receive a PI for prefix1. 
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90)) + expectAutoGenAddrEvent(addr1, newAddr) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should have %s in the list of addresses", addr1) + } + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr1) - // Refresh lifetime for addr of prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: - } - expectPrimaryAddr(addr1) + // Refresh lifetime for addr of prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr1) - // Wait for addr of prefix1 to be deprecated. - expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should not have %s in the list of addresses", addr1) - } - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - // addr2 should be the primary endpoint now since addr1 is deprecated but - // addr2 is not. - expectPrimaryAddr(addr2) - // addr1 is deprecated but if explicitly requested, it should be used. - fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID} - if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) - } + // Wait for addr of prefix1 to be deprecated. + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + // addr2 should be the primary endpoint now since addr1 is deprecated but + // addr2 is not. + expectPrimaryAddr(addr2) + // addr1 is deprecated but if explicitly requested, it should be used. + fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID} + if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) + } - // Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make - // sure we do not get a deprecation event again. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: - } - expectPrimaryAddr(addr2) - if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) - } + // Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make + // sure we do not get a deprecation event again. 
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + expectPrimaryAddr(addr2) + if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) + } - // Refresh lifetimes for addr of prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: - } - // addr1 is the primary endpoint again since it is non-deprecated now. - expectPrimaryAddr(addr1) + // Refresh lifetimes for addr of prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } + // addr1 is the primary endpoint again since it is non-deprecated now. + expectPrimaryAddr(addr1) - // Wait for addr of prefix1 to be deprecated. - expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should not have %s in the list of addresses", addr1) - } - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - // addr2 should be the primary endpoint now since it is not deprecated. - expectPrimaryAddr(addr2) - if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) - } + // Wait for addr of prefix1 to be deprecated. + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + // addr2 should be the primary endpoint now since it is not deprecated. + expectPrimaryAddr(addr2) + if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) + } - // Wait for addr of prefix1 to be invalidated. - expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout) - if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should not have %s in the list of addresses", addr1) - } - if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should have %s in the list of addresses", addr2) - } - expectPrimaryAddr(addr2) + // Wait for addr of prefix1 to be invalidated. 
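The assertions in this test all hinge on the two SLAAC lifetimes from RFC 4862: when the preferred lifetime expires an address is deprecated (skipped by default source-address selection but still usable when explicitly requested), and when the valid lifetime expires it is invalidated and removed. A toy model of that distinction, not stack code (assumes a time import):

// slaacLifetimes is an illustrative model of the two expiry timers the
// deprecation and invalidation events below correspond to.
type slaacLifetimes struct {
	preferredUntil time.Time // expiry => deprecated: demoted, usable on explicit request
	validUntil     time.Time // expiry => invalidated: removed from the NIC entirely
}

func (l slaacLifetimes) deprecated(now time.Time) bool  { return now.After(l.preferredUntil) }
func (l slaacLifetimes) invalidated(now time.Time) bool { return now.After(l.validUntil) }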
+ expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout) + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should have %s in the list of addresses", addr2) + } + expectPrimaryAddr(addr2) - // Refresh both lifetimes for addr of prefix2 to the same value. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - default: - } + // Refresh both lifetimes for addr of prefix2 to the same value. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + default: + } - // Wait for a deprecation then invalidation events, or just an invalidation - // event. We need to cover both cases but cannot deterministically hit both - // cases because the deprecation and invalidation handlers could be handled in - // either deprecation then invalidation, or invalidation then deprecation - // (which should be cancelled by the invalidation handler). - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" { - // If we get a deprecation event first, we should get an invalidation - // event almost immediately after. + // Wait for a deprecation then invalidation events, or just an invalidation + // event. We need to cover both cases but cannot deterministically hit both + // cases because the deprecation and invalidation handlers could be handled in + // either deprecation then invalidation, or invalidation then deprecation + // (which should be cancelled by the invalidation handler). select { case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" { + // If we get a deprecation event first, we should get an invalidation + // event almost immediately after. + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + case <-time.After(defaultAsyncPositiveEventTimeout): + t.Fatal("timed out waiting for addr auto gen event") + } + } else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" { + // If we get an invalidation event first, we should not get a deprecation + // event after. + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto-generated event") + case <-time.After(defaultAsyncNegativeEventTimeout): + } + } else { + t.Fatalf("got unexpected auto-generated event") } - case <-time.After(defaultAsyncPositiveEventTimeout): + case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } - } else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" { - // If we get an invalidation event first, we should not get a deprecation - // event after. 
- select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto-generated event") - case <-time.After(defaultAsyncNegativeEventTimeout): + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { + t.Fatalf("should not have %s in the list of addresses", addr1) + } + if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { + t.Fatalf("should not have %s in the list of addresses", addr2) + } + // Should not have any primary endpoints. + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if want := (tcpip.AddressWithPrefix{}); got != want { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want) + } + wq := waiter.Queue{} + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + defer close(ch) + ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq) + if err != nil { + t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) + } + defer ep.Close() + if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { + t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err) } - } else { - t.Fatalf("got unexpected auto-generated event") - } - case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for addr auto gen event") - } - if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { - t.Fatalf("should not have %s in the list of addresses", addr1) - } - if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { - t.Fatalf("should not have %s in the list of addresses", addr2) - } - // Should not have any primary endpoints. 
- if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if want := (tcpip.AddressWithPrefix{}); got != want { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want) - } - wq := waiter.Queue{} - we, ch := waiter.NewChannelEntry(nil) - wq.EventRegister(&we, waiter.EventIn) - defer wq.EventUnregister(&we) - defer close(ch) - ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq) - if err != nil { - t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) - } - defer ep.Close() - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err) - } - if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute { - t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute) + if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute { + t.Errorf("got ep.Connect(%+v) = %s, want = %s", dstAddr, err, tcpip.ErrNoRoute) + } + }) } } @@ -3216,12 +3267,12 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { const infiniteVLSeconds = 2 const minVLSeconds = 1 savedIL := header.NDPInfiniteLifetime - savedMinVL := stack.MinPrefixInformationValidLifetimeForUpdate + savedMinVL := ipv6.MinPrefixInformationValidLifetimeForUpdate defer func() { - stack.MinPrefixInformationValidLifetimeForUpdate = savedMinVL + ipv6.MinPrefixInformationValidLifetimeForUpdate = savedMinVL header.NDPInfiniteLifetime = savedIL }() - stack.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second + ipv6.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second prefix, _, addr := prefixSubnetAddr(0, linkAddr1) @@ -3265,12 +3316,13 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -3315,11 +3367,11 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { const infiniteVL = 4294967295 const newMinVL = 4 - saved := stack.MinPrefixInformationValidLifetimeForUpdate + saved := ipv6.MinPrefixInformationValidLifetimeForUpdate defer func() { - stack.MinPrefixInformationValidLifetimeForUpdate = saved + ipv6.MinPrefixInformationValidLifetimeForUpdate = saved }() - stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second + ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second prefix, _, addr := prefixSubnetAddr(0, linkAddr1) @@ -3407,12 +3459,13 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { } e := channel.New(10, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: 
[]stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -3473,12 +3526,13 @@ func TestAutoGenAddrRemoval(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -3524,110 +3578,128 @@ func TestAutoGenAddrRemoval(t *testing.T) { func TestAutoGenAddrAfterRemoval(t *testing.T) { const nicID = 1 - prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) - prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) - ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) - - expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { - t.Helper() - - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - default: - t.Fatal("expected addr auto gen event") - } + stacks := []struct { + name string + useNeighborCache bool + }{ + { + name: "linkAddrCache", + useNeighborCache: false, + }, + { + name: "neighborCache", + useNeighborCache: true, + }, } - expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { - t.Helper() - - if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { - t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) - } else if got != addr { - t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) - } + for _, stackTyp := range stacks { + t.Run(stackTyp.name, func(t *testing.T) { + prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) + prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) + ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache) - if got := addrForNewConnection(t, s); got != addr.Address { - t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) - } - } + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() - // Receive a PI to auto-generate addr1 with a large valid and preferred - // lifetime. - const largeLifetimeSeconds = 999 - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) - expectAutoGenAddrEvent(addr1, newAddr) - expectPrimaryAddr(addr1) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected addr auto gen event") + } + } - // Add addr2 as a static address. 
- protoAddr2 := tcpip.ProtocolAddress{ - Protocol: header.IPv6ProtocolNumber, - AddressWithPrefix: addr2, - } - if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil { - t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err) - } - // addr2 should be more preferred now since it is at the front of the primary - // list. - expectPrimaryAddr(addr2) + expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) { + t.Helper() - // Get a route using addr2 to increment its reference count then remove it - // to leave it in the permanentExpired state. - r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false) - if err != nil { - t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err) - } - defer r.Release() - if err := s.RemoveAddress(nicID, addr2.Address); err != nil { - t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err) - } - // addr1 should be preferred again since addr2 is in the expired state. - expectPrimaryAddr(addr1) + if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) + } else if got != addr { + t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr) + } - // Receive a PI to auto-generate addr2 as valid and preferred. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) - expectAutoGenAddrEvent(addr2, newAddr) - // addr2 should be more preferred now that it is closer to the front of the - // primary list and not deprecated. - expectPrimaryAddr(addr2) + if got := addrForNewConnection(t, s); got != addr.Address { + t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address) + } + } - // Removing the address should result in an invalidation event immediately. - // It should still be in the permanentExpired state because r is still held. - // - // We remove addr2 here to make sure addr2 was marked as a SLAAC address - // (it was previously marked as a static address). - if err := s.RemoveAddress(1, addr2.Address); err != nil { - t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err) - } - expectAutoGenAddrEvent(addr2, invalidatedAddr) - // addr1 should be more preferred since addr2 is in the expired state. - expectPrimaryAddr(addr1) + // Receive a PI to auto-generate addr1 with a large valid and preferred + // lifetime. + const largeLifetimeSeconds = 999 + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) + expectAutoGenAddrEvent(addr1, newAddr) + expectPrimaryAddr(addr1) - // Receive a PI to auto-generate addr2 as valid and deprecated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0)) - expectAutoGenAddrEvent(addr2, newAddr) - // addr1 should still be more preferred since addr2 is deprecated, even though - // it is closer to the front of the primary list. - expectPrimaryAddr(addr1) + // Add addr2 as a static address. 
+ protoAddr2 := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addr2, + } + if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil { + t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err) + } + // addr2 should be more preferred now since it is at the front of the primary + // list. + expectPrimaryAddr(addr2) - // Receive a PI to refresh addr2's preferred lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly got an auto gen addr event") - default: - } - // addr2 should be more preferred now that it is not deprecated. - expectPrimaryAddr(addr2) + // Get a route using addr2 to increment its reference count then remove it + // to leave it in the permanentExpired state. + r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false) + if err != nil { + t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err) + } + defer r.Release() + if err := s.RemoveAddress(nicID, addr2.Address); err != nil { + t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err) + } + // addr1 should be preferred again since addr2 is in the expired state. + expectPrimaryAddr(addr1) + + // Receive a PI to auto-generate addr2 as valid and preferred. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) + expectAutoGenAddrEvent(addr2, newAddr) + // addr2 should be more preferred now that it is closer to the front of the + // primary list and not deprecated. + expectPrimaryAddr(addr2) + + // Removing the address should result in an invalidation event immediately. + // It should still be in the permanentExpired state because r is still held. + // + // We remove addr2 here to make sure addr2 was marked as a SLAAC address + // (it was previously marked as a static address). + if err := s.RemoveAddress(1, addr2.Address); err != nil { + t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err) + } + expectAutoGenAddrEvent(addr2, invalidatedAddr) + // addr1 should be more preferred since addr2 is in the expired state. + expectPrimaryAddr(addr1) + + // Receive a PI to auto-generate addr2 as valid and deprecated. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0)) + expectAutoGenAddrEvent(addr2, newAddr) + // addr1 should still be more preferred since addr2 is deprecated, even though + // it is closer to the front of the primary list. + expectPrimaryAddr(addr1) + + // Receive a PI to refresh addr2's preferred lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly got an auto gen addr event") + default: + } + // addr2 should be more preferred now that it is not deprecated. 
+ expectPrimaryAddr(addr2) - if err := s.RemoveAddress(1, addr2.Address); err != nil { - t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err) + if err := s.RemoveAddress(1, addr2.Address); err != nil { + t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err) + } + expectAutoGenAddrEvent(addr2, invalidatedAddr) + expectPrimaryAddr(addr1) + }) } - expectAutoGenAddrEvent(addr2, invalidatedAddr) - expectPrimaryAddr(addr1) } // TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that @@ -3640,12 +3712,13 @@ func TestAutoGenAddrStaticConflict(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { @@ -3721,18 +3794,19 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ - NICNameFromID: func(_ tcpip.NICID, nicName string) string { - return nicName + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, }, - SecretKey: secretKey, - }, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(_ tcpip.NICID, nicName string) string { + return nicName + }, + SecretKey: secretKey, + }, + })}, }) opts := stack.NICOptions{Name: nicName} if err := s.CreateNICWithOptions(nicID, e, opts); err != nil { @@ -3796,11 +3870,11 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { const lifetimeSeconds = 10 // Needed for the temporary address sub test. 
- savedMaxDesync := stack.MaxDesyncFactor + savedMaxDesync := ipv6.MaxDesyncFactor defer func() { - stack.MaxDesyncFactor = savedMaxDesync + ipv6.MaxDesyncFactor = savedMaxDesync }() - stack.MaxDesyncFactor = time.Nanosecond + ipv6.MaxDesyncFactor = time.Nanosecond var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte secretKey := secretKeyBuf[:] @@ -3878,14 +3952,14 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { addrTypes := []struct { name string - ndpConfigs stack.NDPConfigurations + ndpConfigs ipv6.NDPConfigurations autoGenLinkLocal bool prepareFn func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix addrGenFn func(dadCounter uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix }{ { name: "Global address", - ndpConfigs: stack.NDPConfigurations{ + ndpConfigs: ipv6.NDPConfigurations{ DupAddrDetectTransmits: dadTransmits, RetransmitTimer: retransmitTimer, HandleRAs: true, @@ -3903,7 +3977,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { }, { name: "LinkLocal address", - ndpConfigs: stack.NDPConfigurations{ + ndpConfigs: ipv6.NDPConfigurations{ DupAddrDetectTransmits: dadTransmits, RetransmitTimer: retransmitTimer, }, @@ -3917,7 +3991,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { }, { name: "Temporary address", - ndpConfigs: stack.NDPConfigurations{ + ndpConfigs: ipv6.NDPConfigurations{ DupAddrDetectTransmits: dadTransmits, RetransmitTimer: retransmitTimer, HandleRAs: true, @@ -3969,16 +4043,17 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { ndpConfigs := addrType.ndpConfigs ndpConfigs.AutoGenAddressConflictRetries = maxRetries s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal, - NDPConfigs: ndpConfigs, - NDPDisp: &ndpDisp, - OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ - NICNameFromID: func(_ tcpip.NICID, nicName string) string { - return nicName + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal, + NDPConfigs: ndpConfigs, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(_ tcpip.NICID, nicName string) string { + return nicName + }, + SecretKey: secretKey, }, - SecretKey: secretKey, - }, + })}, }) opts := stack.NICOptions{Name: nicName} if err := s.CreateNICWithOptions(nicID, e, opts); err != nil { @@ -3999,9 +4074,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { } // Simulate a DAD conflict. 
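The hunks below replace direct calls to the stack-internal s.DupTentativeAddrDetected with rxNDPSolicit, so the duplicate is now detected the way it would be on a real network: from an inbound packet. Per RFC 4862 section 5.4.3, a Neighbor Solicitation sourced from the unspecified address whose target is one of our tentative addresses means another node is performing DAD on the same address. A deliberately simplified sketch of that predicate; the real helper builds and injects a complete NS packet rather than calling anything like this directly:

// isDADConflict is a hypothetical, simplified statement of the rule the
// stack applies while parsing the injected solicit.
func isDADConflict(src, target, tentative tcpip.Address) bool {
	return src == header.IPv6Any && target == tentative
}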
- if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil { - t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err) - } + rxNDPSolicit(e, addr.Address) expectAutoGenAddrEvent(t, &ndpDisp, addr, invalidatedAddr) expectDADEvent(t, &ndpDisp, addr.Address, false) @@ -4059,14 +4132,14 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) { addrTypes := []struct { name string - ndpConfigs stack.NDPConfigurations + ndpConfigs ipv6.NDPConfigurations autoGenLinkLocal bool subnet tcpip.Subnet triggerSLAACFn func(e *channel.Endpoint) }{ { name: "Global address", - ndpConfigs: stack.NDPConfigurations{ + ndpConfigs: ipv6.NDPConfigurations{ DupAddrDetectTransmits: dadTransmits, RetransmitTimer: retransmitTimer, HandleRAs: true, @@ -4082,7 +4155,7 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) { }, { name: "LinkLocal address", - ndpConfigs: stack.NDPConfigurations{ + ndpConfigs: ipv6.NDPConfigurations{ DupAddrDetectTransmits: dadTransmits, RetransmitTimer: retransmitTimer, AutoGenAddressConflictRetries: maxRetries, @@ -4105,10 +4178,11 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal, - NDPConfigs: addrType.ndpConfigs, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal, + NDPConfigs: addrType.ndpConfigs, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -4138,9 +4212,7 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) { expectAutoGenAddrEvent(addr, newAddr) // Simulate a DAD conflict. - if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil { - t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err) - } + rxNDPSolicit(e, addr.Address) expectAutoGenAddrEvent(addr, invalidatedAddr) select { case e := <-ndpDisp.dadC: @@ -4190,21 +4262,22 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - DupAddrDetectTransmits: dadTransmits, - RetransmitTimer: retransmitTimer, - HandleRAs: true, - AutoGenGlobalAddresses: true, - AutoGenAddressConflictRetries: maxRetries, - }, - NDPDisp: &ndpDisp, - OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ - NICNameFromID: func(_ tcpip.NICID, nicName string) string { - return nicName + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + DupAddrDetectTransmits: dadTransmits, + RetransmitTimer: retransmitTimer, + HandleRAs: true, + AutoGenGlobalAddresses: true, + AutoGenAddressConflictRetries: maxRetries, }, - SecretKey: secretKey, - }, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(_ tcpip.NICID, nicName string) string { + return nicName + }, + SecretKey: secretKey, + }, + })}, }) opts := stack.NICOptions{Name: nicName} if err := s.CreateNICWithOptions(nicID, e, opts); err != nil { @@ -4236,9 +4309,7 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) { // Simulate a DAD conflict after some time has passed. 
time.Sleep(failureTimer) - if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil { - t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err) - } + rxNDPSolicit(e, addr.Address) expectAutoGenAddrEvent(addr, invalidatedAddr) select { case e := <-ndpDisp.dadC: @@ -4399,11 +4470,12 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(1, e); err != nil { t.Fatalf("CreateNIC(1) = %s", err) @@ -4449,11 +4521,12 @@ func TestNDPDNSSearchListDispatch(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -4580,7 +4653,7 @@ func TestCleanupNDPState(t *testing.T) { name: "Enable forwarding", cleanupFn: func(t *testing.T, s *stack.Stack) { t.Helper() - s.SetForwarding(true) + s.SetForwarding(ipv6.ProtocolNumber, true) }, keepAutoGenLinkLocal: true, maxAutoGenAddrEvents: 4, @@ -4634,15 +4707,16 @@ func TestCleanupNDPState(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents), } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - AutoGenIPv6LinkLocal: true, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - DiscoverDefaultRouters: true, - DiscoverOnLinkPrefixes: true, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + AutoGenIPv6LinkLocal: true, + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + DiscoverDefaultRouters: true, + DiscoverOnLinkPrefixes: true, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, }) expectRouterEvent := func() (bool, ndpRouterEvent) { @@ -4907,18 +4981,19 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + }, + NDPDisp: &ndpDisp, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - expectDHCPv6Event := func(configuration stack.DHCPv6ConfigurationFromNDPRA) { + expectDHCPv6Event := func(configuration ipv6.DHCPv6ConfigurationFromNDPRA) { t.Helper() select { case e := <-ndpDisp.dhcpv6ConfigurationC: @@ -4942,7 +5017,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { // Even if the first RA reports no DHCPv6 configurations are available, the // dispatcher should get an event. 
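The assertions below encode two rules: the M (managed address) flag subsumes the O (other configurations) flag, and the dispatcher only hears about changes in the effective configuration, except that the very first RA always produces an event. A hypothetical miniature of that behavior, using only the enum values named in this diff; it is not the stack's implementation:

type dhcpv6Tracker struct {
	seen   bool
	last   ipv6.DHCPv6ConfigurationFromNDPRA
	notify func(ipv6.DHCPv6ConfigurationFromNDPRA)
}

func (d *dhcpv6Tracker) onRA(managed, other bool) {
	config := ipv6.DHCPv6NoConfiguration
	switch {
	case managed: // M set; O is redundant.
		config = ipv6.DHCPv6ManagedAddress
	case other:
		config = ipv6.DHCPv6OtherConfigurations
	}
	if !d.seen || config != d.last {
		d.seen, d.last = true, config
		d.notify(config) // first RA always notifies; repeats are suppressed
	}
}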
e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false)) - expectDHCPv6Event(stack.DHCPv6NoConfiguration) + expectDHCPv6Event(ipv6.DHCPv6NoConfiguration) // Receiving the same update again should not result in an event to the // dispatcher. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false)) @@ -4951,19 +5026,19 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { // Receive an RA that updates the DHCPv6 configuration to Other // Configurations. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) - expectDHCPv6Event(stack.DHCPv6OtherConfigurations) + expectDHCPv6Event(ipv6.DHCPv6OtherConfigurations) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) expectNoDHCPv6Event() // Receive an RA that updates the DHCPv6 configuration to Managed Address. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false)) - expectDHCPv6Event(stack.DHCPv6ManagedAddress) + expectDHCPv6Event(ipv6.DHCPv6ManagedAddress) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false)) expectNoDHCPv6Event() // Receive an RA that updates the DHCPv6 configuration to none. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false)) - expectDHCPv6Event(stack.DHCPv6NoConfiguration) + expectDHCPv6Event(ipv6.DHCPv6NoConfiguration) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false)) expectNoDHCPv6Event() @@ -4971,7 +5046,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { // // Note, when the M flag is set, the O flag is redundant. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true)) - expectDHCPv6Event(stack.DHCPv6ManagedAddress) + expectDHCPv6Event(ipv6.DHCPv6ManagedAddress) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true)) expectNoDHCPv6Event() // Even though the DHCPv6 flags are different, the effective configuration is @@ -4984,7 +5059,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { // Receive an RA that updates the DHCPv6 configuration to Other // Configurations. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) - expectDHCPv6Event(stack.DHCPv6OtherConfigurations) + expectDHCPv6Event(ipv6.DHCPv6OtherConfigurations) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) expectNoDHCPv6Event() @@ -4999,7 +5074,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { // Receive an RA that updates the DHCPv6 configuration to Other // Configurations. 
e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) - expectDHCPv6Event(stack.DHCPv6OtherConfigurations) + expectDHCPv6Event(ipv6.DHCPv6OtherConfigurations) e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true)) expectNoDHCPv6Event() } @@ -5157,12 +5232,13 @@ func TestRouterSolicitation(t *testing.T) { } } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - MaxRtrSolicitations: test.maxRtrSolicit, - RtrSolicitationInterval: test.rtrSolicitInt, - MaxRtrSolicitationDelay: test.maxRtrSolicitDelay, - }, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + MaxRtrSolicitations: test.maxRtrSolicit, + RtrSolicitationInterval: test.rtrSolicitInt, + MaxRtrSolicitationDelay: test.maxRtrSolicitDelay, + }, + })}, }) if err := s.CreateNIC(nicID, &e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -5226,11 +5302,11 @@ func TestStopStartSolicitingRouters(t *testing.T) { name: "Enable and disable forwarding", startFn: func(t *testing.T, s *stack.Stack) { t.Helper() - s.SetForwarding(false) + s.SetForwarding(ipv6.ProtocolNumber, false) }, stopFn: func(t *testing.T, s *stack.Stack, _ bool) { t.Helper() - s.SetForwarding(true) + s.SetForwarding(ipv6.ProtocolNumber, true) }, }, @@ -5297,12 +5373,13 @@ func TestStopStartSolicitingRouters(t *testing.T) { checker.NDPRS()) } s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - MaxRtrSolicitations: maxRtrSolicitations, - RtrSolicitationInterval: interval, - MaxRtrSolicitationDelay: delay, - }, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + MaxRtrSolicitations: maxRtrSolicitations, + RtrSolicitationInterval: interval, + MaxRtrSolicitationDelay: delay, + }, + })}, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go index 27e1feec0..4df288798 100644 --- a/pkg/tcpip/stack/neighbor_cache.go +++ b/pkg/tcpip/stack/neighbor_cache.go @@ -131,10 +131,17 @@ func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkA defer entry.mu.Unlock() switch s := entry.neigh.State; s { - case Reachable, Static: + case Stale: + entry.handlePacketQueuedLocked() + fallthrough + case Reachable, Static, Delay, Probe: + // As per RFC 4861 section 7.3.3: + // "Neighbor Unreachability Detection operates in parallel with the sending + // of packets to a neighbor. While reasserting a neighbor's reachability, + // a node continues sending packets to that neighbor using the cached + // link-layer address." 
return entry.neigh, nil, nil - - case Unknown, Incomplete, Stale, Delay, Probe: + case Unknown, Incomplete: entry.addWakerLocked(w) if entry.done == nil { @@ -147,10 +154,8 @@ func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkA entry.handlePacketQueuedLocked() return entry.neigh, entry.done, tcpip.ErrWouldBlock - case Failed: return entry.neigh, nil, tcpip.ErrNoLinkAddress - default: panic(fmt.Sprintf("Invalid cache entry state: %s", s)) } diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go index b4fa69e3e..fcd54ed83 100644 --- a/pkg/tcpip/stack/neighbor_cache_test.go +++ b/pkg/tcpip/stack/neighbor_cache_test.go @@ -30,6 +30,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/faketime" ) const ( @@ -239,7 +240,7 @@ type entryEvent struct { func TestNeighborCacheGetConfig(t *testing.T) { nudDisp := testNUDDispatcher{} c := DefaultNUDConfigurations() - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, c, clock) if got, want := neigh.config(), c; got != want { @@ -257,7 +258,7 @@ func TestNeighborCacheGetConfig(t *testing.T) { func TestNeighborCacheSetConfig(t *testing.T) { nudDisp := testNUDDispatcher{} c := DefaultNUDConfigurations() - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, c, clock) c.MinRandomFactor = 1 @@ -279,7 +280,7 @@ func TestNeighborCacheSetConfig(t *testing.T) { func TestNeighborCacheEntry(t *testing.T) { c := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, c, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -298,7 +299,7 @@ func TestNeighborCacheEntry(t *testing.T) { t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) wantEvents := []testEntryEventInfo{ { @@ -339,7 +340,7 @@ func TestNeighborCacheRemoveEntry(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -358,7 +359,7 @@ func TestNeighborCacheRemoveEntry(t *testing.T) { t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) wantEvents := []testEntryEventInfo{ { @@ -409,7 +410,7 @@ func TestNeighborCacheRemoveEntry(t *testing.T) { } type testContext struct { - clock *fakeClock + clock *faketime.ManualClock neigh *neighborCache store *testEntryStore linkRes *testNeighborResolver @@ -418,7 +419,7 @@ type testContext struct { func newTestContext(c NUDConfigurations) testContext { nudDisp := &testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(nudDisp, c, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -454,7 +455,7 @@ func (c *testContext) overflowCache(opts overflowOptions) error { if _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil); err != tcpip.ErrWouldBlock { return fmt.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, 
entry.LocalAddr, err, tcpip.ErrWouldBlock) } - c.clock.advance(c.neigh.config().RetransmitTimer) + c.clock.Advance(c.neigh.config().RetransmitTimer) var wantEvents []testEntryEventInfo @@ -567,7 +568,7 @@ func TestNeighborCacheRemoveEntryThenOverflow(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - c.clock.advance(c.neigh.config().RetransmitTimer) + c.clock.Advance(c.neigh.config().RetransmitTimer) wantEvents := []testEntryEventInfo{ { EventType: entryTestAdded, @@ -803,7 +804,7 @@ func TestNeighborCacheOverwriteWithStaticEntryThenOverflow(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - c.clock.advance(typicalLatency) + c.clock.Advance(typicalLatency) wantEvents := []testEntryEventInfo{ { EventType: entryTestAdded, @@ -876,7 +877,7 @@ func TestNeighborCacheNotifiesWaker(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -902,7 +903,7 @@ func TestNeighborCacheNotifiesWaker(t *testing.T) { if doneCh == nil { t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) select { case <-doneCh: @@ -944,7 +945,7 @@ func TestNeighborCacheRemoveWaker(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -974,7 +975,7 @@ func TestNeighborCacheRemoveWaker(t *testing.T) { // Remove the waker before the neighbor cache has the opportunity to send a // notification. 
neigh.removeWaker(entry.Addr, &w) - clock.advance(typicalLatency) + clock.Advance(typicalLatency) select { case <-doneCh: @@ -1073,7 +1074,7 @@ func TestNeighborCacheClear(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -1092,7 +1093,7 @@ func TestNeighborCacheClear(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) wantEvents := []testEntryEventInfo{ { @@ -1188,7 +1189,7 @@ func TestNeighborCacheClearThenOverflow(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - c.clock.advance(typicalLatency) + c.clock.Advance(typicalLatency) wantEvents := []testEntryEventInfo{ { EventType: entryTestAdded, @@ -1249,7 +1250,7 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { config.MaxRandomFactor = 1 nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -1277,7 +1278,7 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) select { case <-doneCh: default: @@ -1325,7 +1326,7 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) select { case <-doneCh: default: @@ -1412,7 +1413,7 @@ func TestNeighborCacheConcurrent(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -1440,7 +1441,7 @@ func TestNeighborCacheConcurrent(t *testing.T) { wg.Wait() // Process all the requests for a single entry concurrently - clock.advance(typicalLatency) + clock.Advance(typicalLatency) } // All goroutines add in the same order and add more values than can fit in @@ -1472,7 +1473,7 @@ func TestNeighborCacheReplace(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -1491,7 +1492,7 @@ func TestNeighborCacheReplace(t *testing.T) { if err != tcpip.ErrWouldBlock { t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) select { case <-doneCh: default: @@ -1499,24 +1500,26 @@ func TestNeighborCacheReplace(t *testing.T) { } // Verify the entry exists - e, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) - if err != nil { - t.Errorf("unexpected error from 
neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) - } - if doneCh != nil { - t.Errorf("unexpected done channel from neigh.entry(%s, %s, _, nil): %v", entry.Addr, entry.LocalAddr, doneCh) - } - if t.Failed() { - t.FailNow() - } - want := NeighborEntry{ - Addr: entry.Addr, - LocalAddr: entry.LocalAddr, - LinkAddr: entry.LinkAddr, - State: Reachable, - } - if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { - t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LinkAddr, diff) + { + e, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + if doneCh != nil { + t.Errorf("unexpected done channel from neigh.entry(%s, %s, _, nil): %v", entry.Addr, entry.LocalAddr, doneCh) + } + if t.Failed() { + t.FailNow() + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LinkAddr, diff) + } } // Notify of a link address change @@ -1535,28 +1538,34 @@ func TestNeighborCacheReplace(t *testing.T) { IsRouter: false, }) - // Requesting the entry again should start address resolution + // Requesting the entry again should start neighbor reachability confirmation. + // + // Verify the entry's new link address and the new state. { - _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) - if err != tcpip.ErrWouldBlock { - t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil { + t.Fatalf("neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) } - clock.advance(config.DelayFirstProbeTime + typicalLatency) - select { - case <-doneCh: - default: - t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: updatedLinkAddr, + State: Delay, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff) } + clock.Advance(config.DelayFirstProbeTime + typicalLatency) } - // Verify the entry's new link address + // Verify that the neighbor is now reachable. 
{ e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) - clock.advance(typicalLatency) + clock.Advance(typicalLatency) if err != nil { t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) } - want = NeighborEntry{ + want := NeighborEntry{ Addr: entry.Addr, LocalAddr: entry.LocalAddr, LinkAddr: updatedLinkAddr, @@ -1572,7 +1581,7 @@ func TestNeighborCacheResolutionFailed(t *testing.T) { config := DefaultNUDConfigurations() nudDisp := testNUDDispatcher{} - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(&nudDisp, config, clock) store := newTestEntryStore() @@ -1595,7 +1604,7 @@ func TestNeighborCacheResolutionFailed(t *testing.T) { if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } - clock.advance(typicalLatency) + clock.Advance(typicalLatency) got, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) if err != nil { t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) @@ -1618,7 +1627,7 @@ func TestNeighborCacheResolutionFailed(t *testing.T) { t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } waitFor := config.DelayFirstProbeTime + typicalLatency*time.Duration(config.MaxMulticastProbes) - clock.advance(waitFor) + clock.Advance(waitFor) if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress { t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress) } @@ -1636,7 +1645,7 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) { config := DefaultNUDConfigurations() config.RetransmitTimer = time.Millisecond // small enough to cause timeout - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(nil, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ @@ -1654,7 +1663,7 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) { t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) } waitFor := config.RetransmitTimer * time.Duration(config.MaxMulticastProbes) - clock.advance(waitFor) + clock.Advance(waitFor) if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress { t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress) } @@ -1664,7 +1673,7 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) { // resolved immediately and don't send resolution requests. func TestNeighborCacheStaticResolution(t *testing.T) { config := DefaultNUDConfigurations() - clock := newFakeClock() + clock := faketime.NewManualClock() neigh := newTestNeighborCache(nil, config, clock) store := newTestEntryStore() linkRes := &testNeighborResolver{ diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go index 0068cacb8..be61a21af 100644 --- a/pkg/tcpip/stack/neighbor_entry.go +++ b/pkg/tcpip/stack/neighbor_entry.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" ) // NeighborEntry describes a neighboring device in the local network. 
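A note on the test changes above: the switch from the package-local fakeClock to faketime.ManualClock is mechanical (advance becomes Advance), but the underlying pattern is what makes these NUD tests deterministic: time only moves when the test says so, and due timers fire synchronously inside Advance. The sketch below is a minimal illustration of that pattern, not the faketime implementation; every name except Advance is invented for the example.

package main

import (
	"fmt"
	"time"
)

// manualClock is a toy test clock: Now only changes when Advance is
// called, and due timers run synchronously inside Advance.
type manualClock struct {
	now    time.Time
	timers []manualTimer
}

type manualTimer struct {
	when time.Time
	fn   func()
}

func (c *manualClock) Now() time.Time { return c.now }

// AfterFunc schedules fn to run once the clock has advanced past d.
func (c *manualClock) AfterFunc(d time.Duration, fn func()) {
	c.timers = append(c.timers, manualTimer{when: c.now.Add(d), fn: fn})
}

// Advance moves time forward by d, firing due timers in chronological
// order. Timers scheduled by a firing callback are honored as well.
func (c *manualClock) Advance(d time.Duration) {
	end := c.now.Add(d)
	for {
		// Find the earliest timer due by the deadline, if any.
		idx := -1
		for i, t := range c.timers {
			if !t.when.After(end) && (idx < 0 || t.when.Before(c.timers[idx].when)) {
				idx = i
			}
		}
		if idx < 0 {
			break
		}
		t := c.timers[idx]
		c.timers = append(c.timers[:idx], c.timers[idx+1:]...)
		c.now = t.when
		t.fn()
	}
	c.now = end
}

func main() {
	var c manualClock
	c.AfterFunc(time.Second, func() { fmt.Println("retransmit timer fired") })
	c.Advance(500 * time.Millisecond) // too early: nothing fires
	c.Advance(time.Second)            // fires deterministically
}

This is why the tests can assert exact event sequences after clock.Advance(typicalLatency) without sleeping or racing against a background timer goroutine.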
@@ -73,8 +74,7 @@ const ( type neighborEntry struct { neighborEntryEntry - nic *NIC - protocol tcpip.NetworkProtocolNumber + nic *NIC // linkRes provides the functionality to send reachability probes, used in // Neighbor Unreachability Detection. @@ -236,7 +236,7 @@ func (e *neighborEntry) setStateLocked(next NeighborState) { return } - if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, "", e.nic.linkEP); err != nil { + if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, "", e.nic.LinkEndpoint); err != nil { // There is no need to log the error here; the NUD implementation may // assume a working link. A valid link should be the responsibility of // the NIC/stack.LinkEndpoint. @@ -277,7 +277,7 @@ func (e *neighborEntry) setStateLocked(next NeighborState) { return } - if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, e.neigh.LinkAddr, e.nic.linkEP); err != nil { + if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, e.neigh.LinkAddr, e.nic.LinkEndpoint); err != nil { e.dispatchRemoveEventLocked() e.setStateLocked(Failed) return @@ -406,9 +406,9 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla // INCOMPLETE state." - RFC 4861 section 7.2.5 case Reachable, Stale, Delay, Probe: - sameLinkAddr := e.neigh.LinkAddr == linkAddr + isLinkAddrDifferent := len(linkAddr) != 0 && e.neigh.LinkAddr != linkAddr - if !sameLinkAddr { + if isLinkAddrDifferent { if !flags.Override { if e.neigh.State == Reachable { e.dispatchChangeEventLocked(Stale) @@ -431,7 +431,7 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla } } - if flags.Solicited && (flags.Override || sameLinkAddr) { + if flags.Solicited && (flags.Override || !isLinkAddrDifferent) { if e.neigh.State != Reachable { e.dispatchChangeEventLocked(Reachable) } @@ -440,7 +440,7 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla e.notifyWakersLocked() } - if e.isRouter && !flags.IsRouter { + if e.isRouter && !flags.IsRouter && header.IsV6UnicastAddress(e.neigh.Addr) { // "In those cases where the IsRouter flag changes from TRUE to FALSE as // a result of this update, the node MUST remove that router from the // Default Router List and update the Destination Cache entries for all @@ -448,9 +448,17 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla // 7.3.3. This is needed to detect when a node that is used as a router // stops forwarding packets due to being configured as a host." // - RFC 4861 section 7.2.5 - e.nic.mu.Lock() - e.nic.mu.ndp.invalidateDefaultRouter(e.neigh.Addr) - e.nic.mu.Unlock() + // + // TODO(gvisor.dev/issue/4085): Remove the special casing we do for IPv6 + // here. 
+ ep, ok := e.nic.networkEndpoints[header.IPv6ProtocolNumber] + if !ok { + panic(fmt.Sprintf("have a neighbor entry for an IPv6 router but no IPv6 network endpoint")) + } + + if ndpEP, ok := ep.(NDPEndpoint); ok { + ndpEP.InvalidateDefaultRouter(e.neigh.Addr) + } } e.isRouter = flags.IsRouter diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go index b769fb2fa..3ee2a3b31 100644 --- a/pkg/tcpip/stack/neighbor_entry_test.go +++ b/pkg/tcpip/stack/neighbor_entry_test.go @@ -27,6 +27,8 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( @@ -81,15 +83,18 @@ func eventDiffOptsWithSort() []cmp.Option { // | Reachable | Stale | Reachable timer expired | | Changed | // | Reachable | Stale | Probe or confirmation w/ different address | | Changed | // | Stale | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Stale | Reachable | Solicited confirmation w/o address | Notify wakers | Changed | // | Stale | Stale | Override confirmation | Update LinkAddr | Changed | // | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed | // | Stale | Delay | Packet sent | | Changed | // | Delay | Reachable | Upper-layer confirmation | | Changed | // | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Delay | Reachable | Solicited confirmation w/o address | Notify wakers | Changed | // | Delay | Stale | Probe or confirmation w/ different address | | Changed | // | Delay | Probe | Delay timer expired | Send probe | Changed | // | Probe | Reachable | Solicited override confirmation | Update LinkAddr | Changed | // | Probe | Reachable | Solicited confirmation w/ same address | Notify wakers | Changed | +// | Probe | Reachable | Solicited confirmation w/o address | Notify wakers | Changed | // | Probe | Stale | Probe or confirmation w/ different address | | Changed | // | Probe | Probe | Retransmit timer expired | Send probe | Changed | // | Probe | Failed | Max probes sent without reply | Notify wakers | Removed | @@ -221,29 +226,27 @@ func (r *entryTestLinkResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumbe return entryTestNetNumber } -func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *entryTestLinkResolver, *fakeClock) { - clock := newFakeClock() +func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *entryTestLinkResolver, *faketime.ManualClock) { + clock := faketime.NewManualClock() disp := testNUDDispatcher{} nic := NIC{ - id: entryTestNICID, - linkEP: nil, // entryTestLinkResolver doesn't use a LinkEndpoint + LinkEndpoint: nil, // entryTestLinkResolver doesn't use a LinkEndpoint + + id: entryTestNICID, stack: &Stack{ clock: clock, nudDisp: &disp, }, } + nic.networkEndpoints = map[tcpip.NetworkProtocolNumber]NetworkEndpoint{ + header.IPv6ProtocolNumber: (&testIPv6Protocol{}).NewEndpoint(&nic, nil, nil, nil), + } rng := rand.New(rand.NewSource(time.Now().UnixNano())) nudState := NewNUDState(c, rng) linkRes := entryTestLinkResolver{} entry := newNeighborEntry(&nic, entryTestAddr1 /* remoteAddr */, entryTestAddr2 /* localAddr */, nudState, &linkRes) - // Stub out ndpState to verify modification of default routers. - nic.mu.ndp = ndpState{ - nic: &nic, - defaultRouters: make(map[tcpip.Address]defaultRouterState), - } - // Stub out the neighbor cache to verify deletion from the cache. 
nic.neigh = &neighborCache{ nic: &nic, @@ -267,7 +270,7 @@ func TestEntryInitiallyUnknown(t *testing.T) { } e.mu.Unlock() - clock.advance(c.RetransmitTimer) + clock.Advance(c.RetransmitTimer) // No probes should have been sent. linkRes.mu.Lock() @@ -300,7 +303,7 @@ func TestEntryUnknownToUnknownWhenConfirmationWithUnknownAddress(t *testing.T) { } e.mu.Unlock() - clock.advance(time.Hour) + clock.Advance(time.Hour) // No probes should have been sent. linkRes.mu.Lock() @@ -410,7 +413,7 @@ func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) { updatedAt := e.neigh.UpdatedAt e.mu.Unlock() - clock.advance(c.RetransmitTimer) + clock.Advance(c.RetransmitTimer) // UpdatedAt should remain the same during address resolution. wantProbes := []entryTestProbeInfo{ @@ -439,7 +442,7 @@ func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) { } e.mu.Unlock() - clock.advance(c.RetransmitTimer) + clock.Advance(c.RetransmitTimer) // UpdatedAt should change after failing address resolution. Timing out after // sending the last probe transitions the entry to Failed. @@ -459,7 +462,7 @@ func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) { } } - clock.advance(c.RetransmitTimer) + clock.Advance(c.RetransmitTimer) wantEvents := []testEntryEventInfo{ { @@ -748,7 +751,7 @@ func TestEntryIncompleteToFailed(t *testing.T) { e.mu.Unlock() waitFor := c.RetransmitTimer * time.Duration(c.MaxMulticastProbes) - clock.advance(waitFor) + clock.Advance(waitFor) wantProbes := []entryTestProbeInfo{ // The Incomplete-to-Incomplete state transition is tested here by @@ -816,6 +819,8 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) { c := DefaultNUDConfigurations() e, nudDisp, linkRes, _ := entryTestSetup(c) + ipv6EP := e.nic.networkEndpoints[header.IPv6ProtocolNumber].(*testIPv6Endpoint) + e.mu.Lock() e.handlePacketQueuedLocked() e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ @@ -829,9 +834,7 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) { if got, want := e.isRouter, true; got != want { t.Errorf("got e.isRouter = %t, want = %t", got, want) } - e.nic.mu.ndp.defaultRouters[entryTestAddr1] = defaultRouterState{ - invalidationJob: e.nic.stack.newJob(&testLocker{}, func() {}), - } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ Solicited: false, Override: false, @@ -840,8 +843,8 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) { if got, want := e.isRouter, false; got != want { t.Errorf("got e.isRouter = %t, want = %t", got, want) } - if _, ok := e.nic.mu.ndp.defaultRouters[entryTestAddr1]; ok { - t.Errorf("unexpected defaultRouter for %s", entryTestAddr1) + if ipv6EP.invalidatedRtr != e.neigh.Addr { + t.Errorf("got ipv6EP.invalidatedRtr = %s, want = %s", ipv6EP.invalidatedRtr, e.neigh.Addr) } e.mu.Unlock() @@ -983,7 +986,7 @@ func TestEntryReachableToStaleWhenTimeout(t *testing.T) { t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) } - clock.advance(c.BaseReachableTime) + clock.Advance(c.BaseReachableTime) wantEvents := []testEntryEventInfo{ { @@ -1370,6 +1373,77 @@ func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { nudDisp.mu.Unlock() } +func TestEntryStaleToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + 
e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if e.neigh.State != Stale { + t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Stale) + } + e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if e.neigh.State != Reachable { + t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable) + } + if e.neigh.LinkAddr != entryTestLinkAddr1 { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) { c := DefaultNUDConfigurations() e, nudDisp, linkRes, _ := entryTestSetup(c) @@ -1612,7 +1686,7 @@ func TestEntryDelayToReachableWhenUpperLevelConfirmation(t *testing.T) { t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) } - clock.advance(c.BaseReachableTime) + clock.Advance(c.BaseReachableTime) wantEvents := []testEntryEventInfo{ { @@ -1706,7 +1780,7 @@ func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) } - clock.advance(c.BaseReachableTime) + clock.Advance(c.BaseReachableTime) wantEvents := []testEntryEventInfo{ { @@ -1752,6 +1826,100 @@ func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { nudDisp.mu.Unlock() } +func TestEntryDelayToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 1 + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. 
+ c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if e.neigh.State != Delay { + t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Delay) + } + e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if e.neigh.State != Reachable { + t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable) + } + if e.neigh.LinkAddr != entryTestLinkAddr1 { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.Advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) { c := DefaultNUDConfigurations() e, nudDisp, linkRes, _ := entryTestSetup(c) @@ -1989,7 +2157,7 @@ func TestEntryDelayToProbe(t *testing.T) { } e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. @@ -2069,7 +2237,7 @@ func TestEntryProbeToStaleWhenProbeWithDifferentAddress(t *testing.T) { e.handlePacketQueuedLocked() e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. @@ -2166,7 +2334,7 @@ func TestEntryProbeToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { e.handlePacketQueuedLocked() e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. 
@@ -2267,7 +2435,7 @@ func TestEntryStaysProbeWhenOverrideConfirmationWithSameAddress(t *testing.T) { e.handlePacketQueuedLocked() e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. @@ -2364,7 +2532,7 @@ func TestEntryUnknownToStaleToProbeToReachable(t *testing.T) { e.handlePacketQueuedLocked() e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // Probe caused by the Delay-to-Probe transition @@ -2398,7 +2566,7 @@ func TestEntryUnknownToStaleToProbeToReachable(t *testing.T) { } e.mu.Unlock() - clock.advance(c.BaseReachableTime) + clock.Advance(c.BaseReachableTime) wantEvents := []testEntryEventInfo{ { @@ -2463,7 +2631,7 @@ func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { e.handlePacketQueuedLocked() e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. @@ -2503,7 +2671,7 @@ func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { } e.mu.Unlock() - clock.advance(c.BaseReachableTime) + clock.Advance(c.BaseReachableTime) wantEvents := []testEntryEventInfo{ { @@ -2575,7 +2743,7 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin e.handlePacketQueuedLocked() e.mu.Unlock() - clock.advance(c.DelayFirstProbeTime) + clock.Advance(c.DelayFirstProbeTime) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. @@ -2612,7 +2780,116 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin } e.mu.Unlock() - clock.advance(c.BaseReachableTime) + clock.Advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. 
+ c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.Advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if e.neigh.State != Probe { + t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Probe) + } + e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if e.neigh.State != Reachable { + t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable) + } + e.mu.Unlock() + + clock.Advance(c.BaseReachableTime) wantEvents := []testEntryEventInfo{ { @@ -2682,7 +2959,7 @@ func TestEntryProbeToFailed(t *testing.T) { e.mu.Unlock() waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) - clock.advance(waitFor) + clock.Advance(waitFor) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. @@ -2787,7 +3064,7 @@ func TestEntryFailedGetsDeleted(t *testing.T) { e.mu.Unlock() waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + c.UnreachableTime - clock.advance(waitFor) + clock.Advance(waitFor) wantProbes := []entryTestProbeInfo{ // The first probe is caused by the Unknown-to-Incomplete transition. diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 728292782..dcd4319bf 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -18,48 +18,45 @@ import ( "fmt" "math/rand" "reflect" - "sort" "sync/atomic" + "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" ) -var ipv4BroadcastAddr = tcpip.ProtocolAddress{ - Protocol: header.IPv4ProtocolNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: header.IPv4Broadcast, - PrefixLen: 8 * header.IPv4AddressSize, - }, -} +var _ NetworkInterface = (*NIC)(nil) // NIC represents a "network interface card" to which the networking stack is // attached. type NIC struct { + LinkEndpoint + stack *Stack id tcpip.NICID name string - linkEP LinkEndpoint context NICContext - stats NICStats - neigh *neighborCache + stats NICStats + neigh *neighborCache + + // The network endpoints themselves may be modified by calling the interface's + // methods, but the map reference and entries must be constant. networkEndpoints map[tcpip.NetworkProtocolNumber]NetworkEndpoint + // enabled is set to 1 when the NIC is enabled and 0 when it is disabled. + // + // Must be accessed using atomic operations. 
+ enabled uint32 + mu struct { sync.RWMutex - enabled bool spoofing bool promiscuous bool - primary map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint - endpoints map[NetworkEndpointID]*referencedNetworkEndpoint - mcastJoins map[NetworkEndpointID]uint32 // packetEPs is protected by mu, but the contained PacketEndpoint // values are not. packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint - ndp ndpState } } @@ -83,25 +80,6 @@ type DirectionStats struct { Bytes *tcpip.StatCounter } -// PrimaryEndpointBehavior is an enumeration of an endpoint's primacy behavior. -type PrimaryEndpointBehavior int - -const ( - // CanBePrimaryEndpoint indicates the endpoint can be used as a primary - // endpoint for new connections with no local address. This is the - // default when calling NIC.AddAddress. - CanBePrimaryEndpoint PrimaryEndpointBehavior = iota - - // FirstPrimaryEndpoint indicates the endpoint should be the first - // primary endpoint considered. If there are multiple endpoints with - // this behavior, the most recently-added one will be first. - FirstPrimaryEndpoint - - // NeverPrimaryEndpoint indicates the endpoint should never be a - // primary endpoint. - NeverPrimaryEndpoint -) - // newNIC returns a new NIC using the default NDP configurations from stack. func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *NIC { // TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For @@ -113,76 +91,77 @@ // of IPv6 is supported on this endpoint's LinkEndpoint. nic := &NIC{ + LinkEndpoint: ep, + stack: stack, id: id, name: name, - linkEP: ep, context: ctx, stats: makeNICStats(), networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint), } - nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint) - nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint) - nic.mu.mcastJoins = make(map[NetworkEndpointID]uint32) nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint) - nic.mu.ndp = ndpState{ - nic: nic, - configs: stack.ndpConfigs, - dad: make(map[tcpip.Address]dadState), - defaultRouters: make(map[tcpip.Address]defaultRouterState), - onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState), - slaacPrefixes: make(map[tcpip.Subnet]slaacPrefixState), - } - nic.mu.ndp.initializeTempAddrState() - - // Register supported packet endpoint protocols. - for _, netProto := range header.Ethertypes { - nic.mu.packetEPs[netProto] = []PacketEndpoint{} - } - for _, netProto := range stack.networkProtocols { - netNum := netProto.Number() - nic.mu.packetEPs[netNum] = nil - nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nic, ep, stack) - } // Check for Neighbor Unreachability Detection support. - if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 { + var nud NUDHandler + if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 && stack.useNeighborCache { rng := rand.New(rand.NewSource(stack.clock.NowNanoseconds())) nic.neigh = &neighborCache{ nic: nic, state: NewNUDState(stack.nudConfigs, rng), cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), } + + // An interface value that holds a nil pointer but non-nil type is not the + // same as the nil interface. Because of this, nud must only be assigned if + // nic.neigh is non-nil since a nil reference to a neighborCache is not + // valid.
+ // + // See https://golang.org/doc/faq#nil_error for more information. + nud = nic.neigh } - nic.linkEP.Attach(nic) + // Register supported packet and network endpoint protocols. + for _, netProto := range header.Ethertypes { + nic.mu.packetEPs[netProto] = []PacketEndpoint{} + } + for _, netProto := range stack.networkProtocols { + netNum := netProto.Number() + nic.mu.packetEPs[netNum] = nil + nic.networkEndpoints[netNum] = netProto.NewEndpoint(nic, stack, nud, nic) + } + + nic.LinkEndpoint.Attach(nic) return nic } -// enabled returns true if n is enabled. -func (n *NIC) enabled() bool { - n.mu.RLock() - enabled := n.mu.enabled - n.mu.RUnlock() - return enabled +func (n *NIC) getNetworkEndpoint(proto tcpip.NetworkProtocolNumber) NetworkEndpoint { + return n.networkEndpoints[proto] } -// disable disables n. +// Enabled implements NetworkInterface. +func (n *NIC) Enabled() bool { + return atomic.LoadUint32(&n.enabled) == 1 +} + +// setEnabled sets the enabled status for the NIC. // -// It undoes the work done by enable. -func (n *NIC) disable() *tcpip.Error { - n.mu.RLock() - enabled := n.mu.enabled - n.mu.RUnlock() - if !enabled { - return nil +// Returns true if the enabled status was updated. +func (n *NIC) setEnabled(v bool) bool { + if v { + return atomic.SwapUint32(&n.enabled, 1) == 0 } + return atomic.SwapUint32(&n.enabled, 0) == 1 +} +// disable disables n. +// +// It undoes the work done by enable. +func (n *NIC) disable() { n.mu.Lock() - err := n.disableLocked() + n.disableLocked() n.mu.Unlock() - return err } // disableLocked disables n. @@ -190,9 +169,9 @@ func (n *NIC) disable() *tcpip.Error { // It undoes the work done by enable. // // n MUST be locked. -func (n *NIC) disableLocked() *tcpip.Error { - if !n.mu.enabled { - return nil +func (n *NIC) disableLocked() { + if !n.setEnabled(false) { + return } // TODO(gvisor.dev/issue/1491): Should Routes that are currently bound to n be @@ -200,38 +179,9 @@ func (n *NIC) disableLocked() *tcpip.Error { // again, and applications may not know that the underlying NIC was ever // disabled. - if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok { - n.mu.ndp.stopSolicitingRouters() - n.mu.ndp.cleanupState(false /* hostOnly */) - - // Stop DAD for all the unicast IPv6 endpoints that are in the - // permanentTentative state. - for _, r := range n.mu.endpoints { - if addr := r.address(); r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) { - n.mu.ndp.stopDuplicateAddressDetection(addr) - } - } - - // The NIC may have already left the multicast group. - if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress { - return err - } - } - - if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok { - // The NIC may have already left the multicast group. - if err := n.leaveGroupLocked(header.IPv4AllSystems, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress { - return err - } - - // The address may have already been removed. - if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress { - return err - } + for _, ep := range n.networkEndpoints { + ep.Disable() } - - n.mu.enabled = false - return nil } // enable enables n. @@ -241,162 +191,38 @@ func (n *NIC) disableLocked() *tcpip.Error { // routers if the stack is not operating as a router. If the stack is also // configured to auto-generate a link-local address, one will be generated. 
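The nil-interface caveat spelled out in newNIC above is a classic Go pitfall worth a standalone illustration: storing a typed nil pointer in an interface produces a non-nil interface, so an unconditional nud = nic.neigh would make later nil checks treat a missing neighbor cache as present. A minimal sketch, with an invented method set (the real NUDHandler interface is defined elsewhere in this package):

package main

import "fmt"

// NUDHandler stands in for the stack's interface; HandleProbe is made up
// for this example.
type NUDHandler interface {
	HandleProbe()
}

type neighborCache struct{}

func (*neighborCache) HandleProbe() {}

func main() {
	var cache *neighborCache // nil pointer, as when NUD is unsupported

	var wrong NUDHandler = cache // interface now holds (*neighborCache)(nil)
	fmt.Println(wrong == nil)    // false: the interface's type word is non-nil

	// The constructor's pattern: assign only when a real cache exists,
	// leaving the interface as the true nil otherwise.
	var nud NUDHandler
	if cache != nil {
		nud = cache
	}
	fmt.Println(nud == nil) // true
}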
func (n *NIC) enable() *tcpip.Error { - n.mu.RLock() - enabled := n.mu.enabled - n.mu.RUnlock() - if enabled { - return nil - } - n.mu.Lock() defer n.mu.Unlock() - if n.mu.enabled { + if !n.setEnabled(true) { return nil } - n.mu.enabled = true - - // Create an endpoint to receive broadcast packets on this interface. - if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok { - if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil { - return err - } - - // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts - // multicast group. Note, the IANA calls the all-hosts multicast group the - // all-systems multicast group. - if err := n.joinGroupLocked(header.IPv4ProtocolNumber, header.IPv4AllSystems); err != nil { - return err - } - } - - // Join the IPv6 All-Nodes Multicast group if the stack is configured to - // use IPv6. This is required to ensure that this node properly receives - // and responds to the various NDP messages that are destined to the - // all-nodes multicast address. An example is the Neighbor Advertisement - // when we perform Duplicate Address Detection, or Router Advertisement - // when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861 - // section 4.2 for more information. - // - // Also auto-generate an IPv6 link-local address based on the NIC's - // link address if it is configured to do so. Note, each interface is - // required to have IPv6 link-local unicast address, as per RFC 4291 - // section 2.1. - _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber] - if !ok { - return nil - } - - // Join the All-Nodes multicast group before starting DAD as responses to DAD - // (NDP NS) messages may be sent to the All-Nodes multicast group if the - // source address of the NDP NS is the unspecified address, as per RFC 4861 - // section 7.2.4. - if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil { - return err - } - - // Perform DAD on the all the unicast IPv6 endpoints that are in the permanent - // state. - // - // Addresses may have aleady completed DAD but in the time since the NIC was - // last enabled, other devices may have acquired the same addresses. - for _, r := range n.mu.endpoints { - addr := r.address() - if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) { - continue - } - - r.setKind(permanentTentative) - if err := n.mu.ndp.startDuplicateAddressDetection(addr, r); err != nil { + for _, ep := range n.networkEndpoints { + if err := ep.Enable(); err != nil { return err } } - // Do not auto-generate an IPv6 link-local address for loopback devices. - if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() { - // The valid and preferred lifetime is infinite for the auto-generated - // link-local address. - n.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime) - } - - // If we are operating as a router, then do not solicit routers since we - // won't process the RAs anyways. - // - // Routers do not process Router Advertisements (RA) the same way a host - // does. That is, routers do not learn from RAs (e.g. on-link prefixes - // and default routers). Therefore, soliciting RAs from other routers on - // a link is unnecessary for routers. 
- if !n.stack.forwarding { - n.mu.ndp.startSolicitingRouters() - } - return nil } -// remove detaches NIC from the link endpoint, and marks existing referenced -// network endpoints expired. This guarantees no packets between this NIC and -// the network stack. +// remove detaches NIC from the link endpoint and releases network endpoint +// resources. This guarantees no packets between this NIC and the network +// stack. func (n *NIC) remove() *tcpip.Error { n.mu.Lock() defer n.mu.Unlock() n.disableLocked() - // TODO(b/151378115): come up with a better way to pick an error than the - // first one. - var err *tcpip.Error - - // Forcefully leave multicast groups. - for nid := range n.mu.mcastJoins { - if tempErr := n.leaveGroupLocked(nid.LocalAddress, true /* force */); tempErr != nil && err == nil { - err = tempErr - } - } - - // Remove permanent and permanentTentative addresses, so no packet goes out. - for nid, ref := range n.mu.endpoints { - switch ref.getKind() { - case permanentTentative, permanent: - if tempErr := n.removePermanentAddressLocked(nid.LocalAddress); tempErr != nil && err == nil { - err = tempErr - } - } - } - - // Release any resources the network endpoint may hold. for _, ep := range n.networkEndpoints { ep.Close() } // Detach from link endpoint, so no packet comes in. - n.linkEP.Attach(nil) - - return err -} - -// becomeIPv6Router transitions n into an IPv6 router. -// -// When transitioning into an IPv6 router, host-only state (NDP discovered -// routers, discovered on-link prefixes, and auto-generated addresses) will -// be cleaned up/invalidated and NDP router solicitations will be stopped. -func (n *NIC) becomeIPv6Router() { - n.mu.Lock() - defer n.mu.Unlock() - - n.mu.ndp.cleanupState(true /* hostOnly */) - n.mu.ndp.stopSolicitingRouters() -} - -// becomeIPv6Host transitions n into an IPv6 host. -// -// When transitioning into an IPv6 host, NDP router solicitations will be -// started. -func (n *NIC) becomeIPv6Host() { - n.mu.Lock() - defer n.mu.Unlock() - - n.mu.ndp.startSolicitingRouters() + n.LinkEndpoint.Attach(nil) + return nil } // setPromiscuousMode enables or disables promiscuous mode. @@ -413,217 +239,113 @@ func (n *NIC) isPromiscuousMode() bool { return rv } -func (n *NIC) isLoopback() bool { - return n.linkEP.Capabilities()&CapabilityLoopback != 0 -} - -// setSpoofing enables or disables address spoofing. -func (n *NIC) setSpoofing(enable bool) { - n.mu.Lock() - n.mu.spoofing = enable - n.mu.Unlock() +// IsLoopback implements NetworkInterface. +func (n *NIC) IsLoopback() bool { + return n.LinkEndpoint.Capabilities()&CapabilityLoopback != 0 } -// primaryEndpoint will return the first non-deprecated endpoint if such an -// endpoint exists for the given protocol and remoteAddr. If no non-deprecated -// endpoint exists, the first deprecated endpoint will be returned. -// -// If an IPv6 primary endpoint is requested, Source Address Selection (as -// defined by RFC 6724 section 5) will be performed. -func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) *referencedNetworkEndpoint { - if protocol == header.IPv6ProtocolNumber && remoteAddr != "" { - return n.primaryIPv6Endpoint(remoteAddr) - } - - n.mu.RLock() - defer n.mu.RUnlock() - - var deprecatedEndpoint *referencedNetworkEndpoint - for _, r := range n.mu.primary[protocol] { - if !r.isValidForOutgoingRLocked() { - continue - } - - if !r.deprecated { - if r.tryIncRef() { - // r is not deprecated, so return it immediately. 
- // - // If we kept track of a deprecated endpoint, decrement its reference - // count since it was incremented when we decided to keep track of it. - if deprecatedEndpoint != nil { - deprecatedEndpoint.decRefLocked() - deprecatedEndpoint = nil - } - - return r - } - } else if deprecatedEndpoint == nil && r.tryIncRef() { - // We prefer an endpoint that is not deprecated, but we keep track of r in - // case n doesn't have any non-deprecated endpoints. - // - // If we end up finding a more preferred endpoint, r's reference count - // will be decremented when such an endpoint is found. - deprecatedEndpoint = r +// WritePacket implements NetworkLinkEndpoint. +func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { + // As per relevant RFCs, we should queue packets while we wait for link + // resolution to complete. + // + // RFC 1122 section 2.3.2.2 (for IPv4): + // The link layer SHOULD save (rather than discard) at least + // one (the latest) packet of each set of packets destined to + // the same unresolved IP address, and transmit the saved + // packet when the address has been resolved. + // + // RFC 4861 section 5.2 (for IPv6): + // Once the IP address of the next-hop node is known, the sender + // examines the Neighbor Cache for link-layer information about that + // neighbor. If no entry exists, the sender creates one, sets its state + // to INCOMPLETE, initiates Address Resolution, and then queues the data + // packet pending completion of address resolution. + if ch, err := r.Resolve(nil); err != nil { + if err == tcpip.ErrWouldBlock { + r := r.Clone() + n.stack.linkResQueue.enqueue(ch, &r, protocol, pkt) + return nil } + return err } - // n doesn't have any valid non-deprecated endpoints, so return - // deprecatedEndpoint (which may be nil if n doesn't have any valid deprecated - // endpoints either). - return deprecatedEndpoint + return n.writePacket(r, gso, protocol, pkt) } -// ipv6AddrCandidate is an IPv6 candidate for Source Address Selection (RFC -// 6724 section 5). -type ipv6AddrCandidate struct { - ref *referencedNetworkEndpoint - scope header.IPv6AddressScope -} +func (n *NIC) writePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { + // WritePacket takes ownership of pkt, calculate numBytes first. + numBytes := pkt.Size() -// primaryIPv6Endpoint returns an IPv6 endpoint following Source Address -// Selection (RFC 6724 section 5). -// -// Note, only rules 1-3 and 7 are followed. -// -// remoteAddr must be a valid IPv6 address. -func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint { - n.mu.RLock() - ref := n.primaryIPv6EndpointRLocked(remoteAddr) - n.mu.RUnlock() - return ref -} - -// primaryIPv6EndpointLocked returns an IPv6 endpoint following Source Address -// Selection (RFC 6724 section 5). -// -// Note, only rules 1-3 and 7 are followed. -// -// remoteAddr must be a valid IPv6 address. -// -// n.mu MUST be read locked. -func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNetworkEndpoint { - primaryAddrs := n.mu.primary[header.IPv6ProtocolNumber] - - if len(primaryAddrs) == 0 { - return nil + if err := n.LinkEndpoint.WritePacket(r, gso, protocol, pkt); err != nil { + return err } - // Create a candidate set of available addresses we can potentially use as a - // source address. 
- cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs)) - for _, r := range primaryAddrs { - // If r is not valid for outgoing connections, it is not a valid endpoint. - if !r.isValidForOutgoingRLocked() { - continue - } - - addr := r.address() - scope, err := header.ScopeForIPv6Address(addr) - if err != nil { - // Should never happen as we got r from the primary IPv6 endpoint list and - // ScopeForIPv6Address only returns an error if addr is not an IPv6 - // address. - panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err)) - } - - cs = append(cs, ipv6AddrCandidate{ - ref: r, - scope: scope, - }) - } + n.stats.Tx.Packets.Increment() + n.stats.Tx.Bytes.IncrementBy(uint64(numBytes)) + return nil +} - remoteScope, err := header.ScopeForIPv6Address(remoteAddr) - if err != nil { - // primaryIPv6Endpoint should never be called with an invalid IPv6 address. - panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)) +// WritePackets implements NetworkLinkEndpoint. +func (n *NIC) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + // TODO(gvisor.dev/issue/4458): Queue packets while link address resolution + // is being performed like WritePacket. + writtenPackets, err := n.LinkEndpoint.WritePackets(r, gso, pkts, protocol) + n.stats.Tx.Packets.IncrementBy(uint64(writtenPackets)) + writtenBytes := 0 + for i, pb := 0, pkts.Front(); i < writtenPackets && pb != nil; i, pb = i+1, pb.Next() { + writtenBytes += pb.Size() } - // Sort the addresses as per RFC 6724 section 5 rules 1-3. - // - // TODO(b/146021396): Implement rules 4-8 of RFC 6724 section 5. - sort.Slice(cs, func(i, j int) bool { - sa := cs[i] - sb := cs[j] - - // Prefer same address as per RFC 6724 section 5 rule 1. - if sa.ref.address() == remoteAddr { - return true - } - if sb.ref.address() == remoteAddr { - return false - } - - // Prefer appropriate scope as per RFC 6724 section 5 rule 2. - if sa.scope < sb.scope { - return sa.scope >= remoteScope - } else if sb.scope < sa.scope { - return sb.scope < remoteScope - } - - // Avoid deprecated addresses as per RFC 6724 section 5 rule 3. - if saDep, sbDep := sa.ref.deprecated, sb.ref.deprecated; saDep != sbDep { - // If sa is not deprecated, it is preferred over sb. - return sbDep - } - - // Prefer temporary addresses as per RFC 6724 section 5 rule 7. - if saTemp, sbTemp := sa.ref.configType == slaacTemp, sb.ref.configType == slaacTemp; saTemp != sbTemp { - return saTemp - } - - // sa and sb are equal, return the endpoint that is closest to the front of - // the primary endpoint list. - return i < j - }) - - // Return the most preferred address that can have its reference count - // incremented. - for _, c := range cs { - if r := c.ref; r.tryIncRef() { - return r - } - } + n.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes)) + return writtenPackets, err +} - return nil +// setSpoofing enables or disables address spoofing. +func (n *NIC) setSpoofing(enable bool) { + n.mu.Lock() + n.mu.spoofing = enable + n.mu.Unlock() } -// hasPermanentAddrLocked returns true if n has a permanent (including currently -// tentative) address, addr. -func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool { - ref, ok := n.mu.endpoints[NetworkEndpointID{addr}] +// primaryAddress returns an address that can be used to communicate with // remoteAddr.
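The queuing behavior documented on WritePacket above (RFC 1122 section 2.3.2.2 for IPv4, RFC 4861 section 5.2 for IPv6) reduces to a simple shape: if resolution would block, hold the packet and transmit it once the resolution channel fires. A rough sketch of that shape under invented names (sendOrQueue and the string "packet" stand in for the real code, which queues PacketBuffers on stack.linkResQueue keyed on the channel returned by r.Resolve):

package main

import (
	"fmt"
	"sync"
)

// sendOrQueue transmits pkt immediately if the neighbor is resolved,
// otherwise it parks the packet until the resolution channel is closed,
// mirroring the queue-while-resolving behavior described in WritePacket.
func sendOrQueue(resolved <-chan struct{}, pkt string, tx func(string), wg *sync.WaitGroup) {
	select {
	case <-resolved:
		tx(pkt) // already resolved: send now
	default:
		wg.Add(1)
		go func() {
			defer wg.Done()
			<-resolved // block until ARP/NDP resolution completes
			tx(pkt)
		}()
	}
}

func main() {
	resolved := make(chan struct{})
	var wg sync.WaitGroup
	tx := func(p string) { fmt.Println("sent:", p) }

	sendOrQueue(resolved, "datagram to 192.0.2.1", tx, &wg)
	close(resolved) // pretend resolution just finished
	wg.Wait()       // prints: sent: datagram to 192.0.2.1
}

The TODO on WritePackets records that the batched path does not yet get this treatment and currently hands the list straight to the link endpoint.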
+func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) AssignableAddressEndpoint { + n.mu.RLock() + spoofing := n.mu.spoofing + n.mu.RUnlock() + ep, ok := n.networkEndpoints[protocol] if !ok { - return false + return nil } - kind := ref.getKind() - - return kind == permanent || kind == permanentTentative + return ep.AcquireOutgoingPrimaryAddress(remoteAddr, spoofing) } -type getRefBehaviour int +type getAddressBehaviour int const ( // spoofing indicates that the NIC's spoofing flag should be observed when - // getting a NIC's referenced network endpoint. - spoofing getRefBehaviour = iota + // getting a NIC's address endpoint. + spoofing getAddressBehaviour = iota // promiscuous indicates that the NIC's promiscuous flag should be observed - // when getting a NIC's referenced network endpoint. + // when getting a NIC's address endpoint. promiscuous ) -func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint { - return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous) +func (n *NIC) getAddress(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) AssignableAddressEndpoint { + return n.getAddressOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous) } // findEndpoint finds the endpoint, if any, with the given address. -func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint { - return n.getRefOrCreateTemp(protocol, address, peb, spoofing) +func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) AssignableAddressEndpoint { + return n.getAddressOrCreateTemp(protocol, address, peb, spoofing) } -// getRefEpOrCreateTemp returns the referenced network endpoint for the given -// protocol and address. +// getAddressEpOrCreateTemp returns the address endpoint for the given protocol +// and address. // // If none exists a temporary one may be created if we are in promiscuous mode // or spoofing. Promiscuous mode will only be checked if promiscuous is true. @@ -631,9 +353,8 @@ func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.A // // If the address is the IPv4 broadcast address for an endpoint's network, that // endpoint will be returned. -func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint { +func (n *NIC) getAddressOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getAddressBehaviour) AssignableAddressEndpoint { n.mu.RLock() - var spoofingOrPromiscuous bool switch tempRef { case spoofing: @@ -641,282 +362,54 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t case promiscuous: spoofingOrPromiscuous = n.mu.promiscuous } - - if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok { - // An endpoint with this id exists, check if it can be used and return it. - if !ref.isAssignedRLocked(spoofingOrPromiscuous) { - n.mu.RUnlock() - return nil - } - - if ref.tryIncRef() { - n.mu.RUnlock() - return ref - } - } - - // Check if address is a broadcast address for the endpoint's network. - // - // Only IPv4 has a notion of broadcast addresses. 
- if protocol == header.IPv4ProtocolNumber { - if ref := n.getRefForBroadcastRLocked(address); ref != nil { - n.mu.RUnlock() - return ref - } - } - - // A usable reference was not found, create a temporary one if requested by - // the caller or if the address is found in the NIC's subnets. - createTempEP := spoofingOrPromiscuous n.mu.RUnlock() - - if !createTempEP { - return nil - } - - // Try again with the lock in exclusive mode. If we still can't get the - // endpoint, create a new "temporary" endpoint. It will only exist while - // there's a route through it. - n.mu.Lock() - ref := n.getRefOrCreateTempLocked(protocol, address, peb) - n.mu.Unlock() - return ref + return n.getAddressOrCreateTempInner(protocol, address, spoofingOrPromiscuous, peb) } -// getRefForBroadcastLocked returns an endpoint where address is the IPv4 -// broadcast address for the endpoint's network. -// -// n.mu MUST be read locked. -func (n *NIC) getRefForBroadcastRLocked(address tcpip.Address) *referencedNetworkEndpoint { - for _, ref := range n.mu.endpoints { - // Only IPv4 has a notion of broadcast addresses. - if ref.protocol != header.IPv4ProtocolNumber { - continue - } - - addr := ref.addrWithPrefix() - subnet := addr.Subnet() - if subnet.IsBroadcast(address) && ref.tryIncRef() { - return ref - } +// getAddressOrCreateTempInner is like getAddressEpOrCreateTemp except a boolean +// is passed to indicate whether or not we should generate temporary endpoints. +func (n *NIC) getAddressOrCreateTempInner(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, createTemp bool, peb PrimaryEndpointBehavior) AssignableAddressEndpoint { + if ep, ok := n.networkEndpoints[protocol]; ok { + return ep.AcquireAssignedAddress(address, createTemp, peb) } return nil } -/// getRefOrCreateTempLocked returns an existing endpoint for address or creates -/// and returns a temporary endpoint. -// -// If the address is the IPv4 broadcast address for an endpoint's network, that -// endpoint will be returned. -// -// n.mu must be write locked. -func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint { - if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok { - // No need to check the type as we are ok with expired endpoints at this - // point. - if ref.tryIncRef() { - return ref - } - // tryIncRef failing means the endpoint is scheduled to be removed once the - // lock is released. Remove it here so we can create a new (temporary) one. - // The removal logic waiting for the lock handles this case. - n.removeEndpointLocked(ref) - } - - // Check if address is a broadcast address for an endpoint's network. - // - // Only IPv4 has a notion of broadcast addresses. - if protocol == header.IPv4ProtocolNumber { - if ref := n.getRefForBroadcastRLocked(address); ref != nil { - return ref - } - } - - // Add a new temporary endpoint. - netProto, ok := n.stack.networkProtocols[protocol] - if !ok { - return nil - } - ref, _ := n.addAddressLocked(tcpip.ProtocolAddress{ - Protocol: protocol, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: address, - PrefixLen: netProto.DefaultPrefixLen(), - }, - }, peb, temporary, static, false) - return ref -} - -// addAddressLocked adds a new protocolAddress to n. -// -// If n already has the address in a non-permanent state, and the kind given is -// permanent, that address will be promoted in place and its properties set to -// the properties provided. 
Otherwise, it returns tcpip.ErrDuplicateAddress. -func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) { - // TODO(b/141022673): Validate IP addresses before adding them. - - // Sanity check. - id := NetworkEndpointID{LocalAddress: protocolAddress.AddressWithPrefix.Address} - if ref, ok := n.mu.endpoints[id]; ok { - // Endpoint already exists. - if kind != permanent { - return nil, tcpip.ErrDuplicateAddress - } - switch ref.getKind() { - case permanentTentative, permanent: - // The NIC already have a permanent endpoint with that address. - return nil, tcpip.ErrDuplicateAddress - case permanentExpired, temporary: - // Promote the endpoint to become permanent and respect the new peb, - // configType and deprecated status. - if ref.tryIncRef() { - // TODO(b/147748385): Perform Duplicate Address Detection when promoting - // an IPv6 endpoint to permanent. - ref.setKind(permanent) - ref.deprecated = deprecated - ref.configType = configType - - refs := n.mu.primary[ref.protocol] - for i, r := range refs { - if r == ref { - switch peb { - case CanBePrimaryEndpoint: - return ref, nil - case FirstPrimaryEndpoint: - if i == 0 { - return ref, nil - } - n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...) - case NeverPrimaryEndpoint: - n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...) - return ref, nil - } - } - } - - n.insertPrimaryEndpointLocked(ref, peb) - - return ref, nil - } - // tryIncRef failing means the endpoint is scheduled to be removed once - // the lock is released. Remove it here so we can create a new - // (permanent) one. The removal logic waiting for the lock handles this - // case. - n.removeEndpointLocked(ref) - } - } - +// addAddress adds a new address to n, so that it starts accepting packets +// targeted at the given address (and network protocol). +func (n *NIC) addAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error { ep, ok := n.networkEndpoints[protocolAddress.Protocol] if !ok { - return nil, tcpip.ErrUnknownProtocol + return tcpip.ErrUnknownProtocol } - isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address) - - // If the address is an IPv6 address and it is a permanent address, - // mark it as tentative so it goes through the DAD process if the NIC is - // enabled. If the NIC is not enabled, DAD will be started when the NIC is - // enabled. - if isIPv6Unicast && kind == permanent { - kind = permanentTentative - } - - ref := &referencedNetworkEndpoint{ - refs: 1, - addr: protocolAddress.AddressWithPrefix, - ep: ep, - nic: n, - protocol: protocolAddress.Protocol, - kind: kind, - configType: configType, - deprecated: deprecated, + addressEndpoint, err := ep.AddAndAcquirePermanentAddress(protocolAddress.AddressWithPrefix, peb, AddressConfigStatic, false /* deprecated */) + if err == nil { + // We have no need for the address endpoint. + addressEndpoint.DecRef() } - - // Set up cache if link address resolution exists for this protocol. - if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 { - if _, ok := n.stack.linkAddrResolvers[protocolAddress.Protocol]; ok { - ref.linkCache = n.stack - } - } - - // If we are adding an IPv6 unicast address, join the solicited-node - // multicast address. 
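addAddress above immediately DecRefs the endpoint it gets back because the NIC only wants the side effect of registering the address. A caller that intends to keep using the new address would hold the reference instead; a hedged sketch (addAndUse is hypothetical, the method and arguments are the ones used above):

func addAndUse(ep AddressableEndpoint, addr tcpip.AddressWithPrefix) *tcpip.Error {
	addressEndpoint, err := ep.AddAndAcquirePermanentAddress(addr, CanBePrimaryEndpoint, AddressConfigStatic, false /* deprecated */)
	if err != nil {
		return err
	}
	// Release the reference once the caller is done with the endpoint.
	defer addressEndpoint.DecRef()
	return nil
}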
- if isIPv6Unicast { - snmc := header.SolicitedNodeAddr(protocolAddress.AddressWithPrefix.Address) - if err := n.joinGroupLocked(protocolAddress.Protocol, snmc); err != nil { - return nil, err - } - } - - n.mu.endpoints[id] = ref - - n.insertPrimaryEndpointLocked(ref, peb) - - // If we are adding a tentative IPv6 address, start DAD if the NIC is enabled. - if isIPv6Unicast && kind == permanentTentative && n.mu.enabled { - if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil { - return nil, err - } - } - - return ref, nil -} - -// AddAddress adds a new address to n, so that it starts accepting packets -// targeted at the given address (and network protocol). -func (n *NIC) AddAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error { - // Add the endpoint. - n.mu.Lock() - _, err := n.addAddressLocked(protocolAddress, peb, permanent, static, false /* deprecated */) - n.mu.Unlock() - return err } -// AllAddresses returns all addresses (primary and non-primary) associated with +// allPermanentAddresses returns all permanent addresses associated with // this NIC. -func (n *NIC) AllAddresses() []tcpip.ProtocolAddress { - n.mu.RLock() - defer n.mu.RUnlock() - - addrs := make([]tcpip.ProtocolAddress, 0, len(n.mu.endpoints)) - for _, ref := range n.mu.endpoints { - // Don't include tentative, expired or temporary endpoints to - // avoid confusion and prevent the caller from using those. - switch ref.getKind() { - case permanentExpired, temporary: - continue +func (n *NIC) allPermanentAddresses() []tcpip.ProtocolAddress { + var addrs []tcpip.ProtocolAddress + for p, ep := range n.networkEndpoints { + for _, a := range ep.PermanentAddresses() { + addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a}) } - - addrs = append(addrs, tcpip.ProtocolAddress{ - Protocol: ref.protocol, - AddressWithPrefix: ref.addrWithPrefix(), - }) } return addrs } -// PrimaryAddresses returns the primary addresses associated with this NIC. -func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress { - n.mu.RLock() - defer n.mu.RUnlock() - +// primaryAddresses returns the primary addresses associated with this NIC. +func (n *NIC) primaryAddresses() []tcpip.ProtocolAddress { var addrs []tcpip.ProtocolAddress - for proto, list := range n.mu.primary { - for _, ref := range list { - // Don't include tentative, expired or tempory endpoints - // to avoid confusion and prevent the caller from using - // those. - switch ref.getKind() { - case permanentTentative, permanentExpired, temporary: - continue - } - - addrs = append(addrs, tcpip.ProtocolAddress{ - Protocol: proto, - AddressWithPrefix: ref.addrWithPrefix(), - }) + for p, ep := range n.networkEndpoints { + for _, a := range ep.PrimaryAddresses() { + addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a}) } } return addrs @@ -928,237 +421,135 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress { // address exists. If no non-deprecated address exists, the first deprecated // address will be returned. 
func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWithPrefix { - n.mu.RLock() - defer n.mu.RUnlock() - - list, ok := n.mu.primary[proto] + ep, ok := n.networkEndpoints[proto] if !ok { return tcpip.AddressWithPrefix{} } - var deprecatedEndpoint *referencedNetworkEndpoint - for _, ref := range list { - // Don't include tentative, expired or tempory endpoints to avoid confusion - // and prevent the caller from using those. - switch ref.getKind() { - case permanentTentative, permanentExpired, temporary: - continue - } - - if !ref.deprecated { - return ref.addrWithPrefix() - } + return ep.MainAddress() +} - if deprecatedEndpoint == nil { - deprecatedEndpoint = ref +// removeAddress removes an address from n. +func (n *NIC) removeAddress(addr tcpip.Address) *tcpip.Error { + for _, ep := range n.networkEndpoints { + if err := ep.RemovePermanentAddress(addr); err == tcpip.ErrBadLocalAddress { + continue + } else { + return err } } - if deprecatedEndpoint != nil { - return deprecatedEndpoint.addrWithPrefix() - } - - return tcpip.AddressWithPrefix{} + return tcpip.ErrBadLocalAddress } -// insertPrimaryEndpointLocked adds r to n's primary endpoint list as required -// by peb. -// -// n MUST be locked. -func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb PrimaryEndpointBehavior) { - switch peb { - case CanBePrimaryEndpoint: - n.mu.primary[r.protocol] = append(n.mu.primary[r.protocol], r) - case FirstPrimaryEndpoint: - n.mu.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.mu.primary[r.protocol]...) +func (n *NIC) neighbors() ([]NeighborEntry, *tcpip.Error) { + if n.neigh == nil { + return nil, tcpip.ErrNotSupported } -} -func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) { - id := NetworkEndpointID{LocalAddress: r.address()} + return n.neigh.entries(), nil +} - // Nothing to do if the reference has already been replaced with a different - // one. This happens in the case where 1) this endpoint's ref count hit zero - // and was waiting (on the lock) to be removed and 2) the same address was - // re-added in the meantime by removing this endpoint from the list and - // adding a new one. - if n.mu.endpoints[id] != r { +func (n *NIC) removeWaker(addr tcpip.Address, w *sleep.Waker) { + if n.neigh == nil { return } - if r.getKind() == permanent { - panic("Reference count dropped to zero before being removed") - } + n.neigh.removeWaker(addr, w) +} - delete(n.mu.endpoints, id) - refs := n.mu.primary[r.protocol] - for i, ref := range refs { - if ref == r { - n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...) 
- refs[len(refs)-1] = nil - break - } +func (n *NIC) addStaticNeighbor(addr tcpip.Address, linkAddress tcpip.LinkAddress) *tcpip.Error { + if n.neigh == nil { + return tcpip.ErrNotSupported } -} -func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) { - n.mu.Lock() - n.removeEndpointLocked(r) - n.mu.Unlock() + n.neigh.addStaticEntry(addr, linkAddress) + return nil } -func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error { - r, ok := n.mu.endpoints[NetworkEndpointID{addr}] - if !ok { - return tcpip.ErrBadLocalAddress - } - - kind := r.getKind() - if kind != permanent && kind != permanentTentative { - return tcpip.ErrBadLocalAddress +func (n *NIC) removeNeighbor(addr tcpip.Address) *tcpip.Error { + if n.neigh == nil { + return tcpip.ErrNotSupported } - switch r.protocol { - case header.IPv6ProtocolNumber: - return n.removePermanentIPv6EndpointLocked(r, true /* allowSLAACInvalidation */) - default: - r.expireLocked() - return nil + if !n.neigh.removeEntry(addr) { + return tcpip.ErrBadAddress } + return nil } -func (n *NIC) removePermanentIPv6EndpointLocked(r *referencedNetworkEndpoint, allowSLAACInvalidation bool) *tcpip.Error { - addr := r.addrWithPrefix() - - isIPv6Unicast := header.IsV6UnicastAddress(addr.Address) - - if isIPv6Unicast { - n.mu.ndp.stopDuplicateAddressDetection(addr.Address) - - // If we are removing an address generated via SLAAC, cleanup - // its SLAAC resources and notify the integrator. - switch r.configType { - case slaac: - n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation) - case slaacTemp: - n.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation) - } - } - - r.expireLocked() - - // At this point the endpoint is deleted. - - // If we are removing an IPv6 unicast address, leave the solicited-node - // multicast address. - // - // We ignore the tcpip.ErrBadLocalAddress error because the solicited-node - // multicast group may be left by user action. - if isIPv6Unicast { - snmc := header.SolicitedNodeAddr(addr.Address) - if err := n.leaveGroupLocked(snmc, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress { - return err - } +func (n *NIC) clearNeighbors() *tcpip.Error { + if n.neigh == nil { + return tcpip.ErrNotSupported } + n.neigh.clear() return nil } -// RemoveAddress removes an address from n. -func (n *NIC) RemoveAddress(addr tcpip.Address) *tcpip.Error { - n.mu.Lock() - defer n.mu.Unlock() - return n.removePermanentAddressLocked(addr) -} - // joinGroup adds a new endpoint for the given multicast address, if none // exists yet. Otherwise it just increments its count. func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { - n.mu.Lock() - defer n.mu.Unlock() - - return n.joinGroupLocked(protocol, addr) -} - -// joinGroupLocked adds a new endpoint for the given multicast address, if none -// exists yet. Otherwise it just increments its count. n MUST be locked before -// joinGroupLocked is called. -func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { // TODO(b/143102137): When implementing MLD, make sure MLD packets are // not sent unless a valid link-local address is available for use on n // as an MLD packet's source address must be a link-local address as // outlined in RFC 3810 section 5. 
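The rewritten joinGroup below reaches group support through a type assertion: only network endpoints that also implement GroupAddressableEndpoint (declared later in registration.go) can join groups, and protocols without that support report ErrNotSupported. A usage sketch, assuming a NIC whose IPv6 endpoint is registered:

func joinAllNodes(n *NIC) *tcpip.Error {
	// ff02::1 is the IPv6 all-nodes group every node is expected to join.
	return n.joinGroup(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress)
}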
- id := NetworkEndpointID{addr} - joins := n.mu.mcastJoins[id] - if joins == 0 { - netProto, ok := n.stack.networkProtocols[protocol] - if !ok { - return tcpip.ErrUnknownProtocol - } - if _, err := n.addAddressLocked(tcpip.ProtocolAddress{ - Protocol: protocol, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: addr, - PrefixLen: netProto.DefaultPrefixLen(), - }, - }, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil { - return err - } + ep, ok := n.networkEndpoints[protocol] + if !ok { + return tcpip.ErrNotSupported } - n.mu.mcastJoins[id] = joins + 1 - return nil + + gep, ok := ep.(GroupAddressableEndpoint) + if !ok { + return tcpip.ErrNotSupported + } + + _, err := gep.JoinGroup(addr) + return err } // leaveGroup decrements the count for the given multicast address, and when it // reaches zero removes the endpoint for this address. -func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error { - n.mu.Lock() - defer n.mu.Unlock() - - return n.leaveGroupLocked(addr, false /* force */) -} +func (n *NIC) leaveGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { + ep, ok := n.networkEndpoints[protocol] + if !ok { + return tcpip.ErrNotSupported + } -// leaveGroupLocked decrements the count for the given multicast address, and -// when it reaches zero removes the endpoint for this address. n MUST be locked -// before leaveGroupLocked is called. -// -// If force is true, then the count for the multicast addres is ignored and the -// endpoint will be removed immediately. -func (n *NIC) leaveGroupLocked(addr tcpip.Address, force bool) *tcpip.Error { - id := NetworkEndpointID{addr} - joins, ok := n.mu.mcastJoins[id] + gep, ok := ep.(GroupAddressableEndpoint) if !ok { - // There are no joins with this address on this NIC. - return tcpip.ErrBadLocalAddress + return tcpip.ErrNotSupported } - joins-- - if force || joins == 0 { - // There are no outstanding joins or we are forced to leave, clean up. - delete(n.mu.mcastJoins, id) - return n.removePermanentAddressLocked(addr) + if _, err := gep.LeaveGroup(addr); err != nil { + return err } - n.mu.mcastJoins[id] = joins return nil } // isInGroup returns true if n has joined the multicast group addr. func (n *NIC) isInGroup(addr tcpip.Address) bool { - n.mu.RLock() - joins := n.mu.mcastJoins[NetworkEndpointID{addr}] - n.mu.RUnlock() + for _, ep := range n.networkEndpoints { + gep, ok := ep.(GroupAddressableEndpoint) + if !ok { + continue + } + + if gep.IsInGroup(addr) { + return true + } + } - return joins != 0 + return false } -func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt *PacketBuffer) { - r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */) +func (n *NIC) handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, remotelinkAddr tcpip.LinkAddress, addressEndpoint AssignableAddressEndpoint, pkt *PacketBuffer) { + r := makeRoute(protocol, dst, src, n, addressEndpoint, false /* handleLocal */, false /* multicastLoop */) + defer r.Release() r.RemoteLinkAddress = remotelinkAddr - - ref.ep.HandlePacket(&r, pkt) - ref.decRef() + n.getNetworkEndpoint(protocol).HandlePacket(&r, pkt) } // DeliverNetworkPacket finds the appropriate network protocol endpoint and @@ -1169,7 +560,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, // the ownership of the items is not retained by the caller. 
func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { n.mu.RLock() - enabled := n.mu.enabled + enabled := n.Enabled() // If the NIC is not yet enabled, don't receive any packets. if !enabled { n.mu.RUnlock() @@ -1192,12 +583,12 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp // If no local link layer address is provided, assume it was sent // directly to this NIC. if local == "" { - local = n.linkEP.LinkAddress() + local = n.LinkEndpoint.LinkAddress() } - // Are any packet sockets listening for this network protocol? + // Are any packet type sockets listening for this network protocol? packetEPs := n.mu.packetEPs[protocol] - // Add any other packet sockets that maybe listening for all protocols. + // Add any other packet type sockets that may be listening for all protocols. packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...) n.mu.RUnlock() for _, ep := range packetEPs { @@ -1218,6 +609,7 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp return } if hasTransportHdr { + pkt.TransportProtocolNumber = transProtoNum // Parse the transport header if present. if state, ok := n.stack.transportProtocols[transProtoNum]; ok { state.proto.Parse(pkt) @@ -1226,29 +618,33 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp src, dst := netProto.ParseAddresses(pkt.NetworkHeader().View()) - if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil { - // The source address is one of our own, so we never should have gotten a - // packet like this unless handleLocal is false. Loopback also calls this - // function even though the packets didn't come from the physical interface - // so don't drop those. - n.stack.stats.IP.InvalidSourceAddressesReceived.Increment() - return + if n.stack.handleLocal && !n.IsLoopback() { + if r := n.getAddress(protocol, src); r != nil { + r.DecRef() + + // The source address is one of our own, so we never should have gotten a + // packet like this unless handleLocal is false. Loopback also calls this + // function even though the packets didn't come from the physical interface + // so don't drop those. + n.stack.stats.IP.InvalidSourceAddressesReceived.Increment() + return + } } - // TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet. // Loopback traffic skips the prerouting chain. - if protocol == header.IPv4ProtocolNumber && !n.isLoopback() { + if !n.IsLoopback() { // iptables filtering. ipt := n.stack.IPTables() address := n.primaryAddress(protocol) if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok { // iptables is telling us to drop the packet. + n.stack.stats.IP.IPTablesPreroutingDropped.Increment() return } } - if ref := n.getRef(protocol, dst); ref != nil { - handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt) + if addressEndpoint := n.getAddress(protocol, dst); addressEndpoint != nil { + n.handlePacket(protocol, dst, src, remote, addressEndpoint, pkt) return } @@ -1256,7 +652,7 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp // packet and forward it to the NIC. // // TODO: Should we be forwarding the packet even if promiscuous? 
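Condensed, the tail of DeliverNetworkPacket makes a three-way choice: deliver locally when an address endpoint is assigned for the destination, otherwise forward when forwarding is enabled for this protocol, otherwise drop. An illustrative restatement of that decision (dispatchSketch is hypothetical and elides the packet-socket and iptables steps):

func (n *NIC) dispatchSketch(protocol tcpip.NetworkProtocolNumber, src, dst tcpip.Address, remote tcpip.LinkAddress, pkt *PacketBuffer) {
	if addressEndpoint := n.getAddress(protocol, dst); addressEndpoint != nil {
		// handlePacket builds a Route around the acquired endpoint and
		// releases both when it is done.
		n.handlePacket(protocol, dst, src, remote, addressEndpoint, pkt)
		return
	}
	if n.stack.Forwarding(protocol) {
		// Forward via the FindRoute/WritePacket sequence in the hunk below.
		return
	}
	n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
}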
- if n.stack.Forwarding() { + if n.stack.Forwarding(protocol) { r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */) if err != nil { n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment() @@ -1264,38 +660,41 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp } // Found a NIC. - n := r.ref.nic - n.mu.RLock() - ref, ok := n.mu.endpoints[NetworkEndpointID{dst}] - ok = ok && ref.isValidForOutgoingRLocked() && ref.tryIncRef() - n.mu.RUnlock() - if ok { - r.LocalLinkAddress = n.linkEP.LinkAddress() - r.RemoteLinkAddress = remote - r.RemoteAddress = src - // TODO(b/123449044): Update the source NIC as well. - ref.ep.HandlePacket(&r, pkt) - ref.decRef() - r.Release() - return + n := r.nic + if addressEndpoint := n.getAddressOrCreateTempInner(protocol, dst, false, NeverPrimaryEndpoint); addressEndpoint != nil { + if n.isValidForOutgoing(addressEndpoint) { + r.LocalLinkAddress = n.LinkEndpoint.LinkAddress() + r.RemoteLinkAddress = remote + r.RemoteAddress = src + // TODO(b/123449044): Update the source NIC as well. + n.getNetworkEndpoint(protocol).HandlePacket(&r, pkt) + addressEndpoint.DecRef() + r.Release() + return + } + + addressEndpoint.DecRef() } // n doesn't have a destination endpoint. // Send the packet out of n. - // TODO(b/128629022): move this logic to route.WritePacket. - if ch, err := r.Resolve(nil); err != nil { - if err == tcpip.ErrWouldBlock { - n.stack.forwarder.enqueue(ch, n, &r, protocol, pkt) - // forwarder will release route. - return - } + // TODO(gvisor.dev/issue/1085): According to the RFC, we must decrease the TTL field for ipv4/ipv6. + + // pkt may have set its header and may not have enough headroom for + // link-layer header for the other link to prepend. Here we create a new + // packet to forward. + fwdPkt := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: int(n.LinkEndpoint.MaxHeaderLength()), + // We need to do a deep copy of the IP packet because WritePacket (and + // friends) take ownership of the packet buffer, but we do not own it. + Data: PayloadSince(pkt.NetworkHeader()).ToVectorisedView(), + }) + + // TODO(b/143425874) Decrease the TTL field in forwarded packets. + if err := n.WritePacket(&r, nil, protocol, fwdPkt); err != nil { n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment() - r.Release() - return } - // The link-address resolution finished immediately. - n.forwardPacket(&r, protocol, pkt) r.Release() return } @@ -1319,41 +718,18 @@ func (n *NIC) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tc p.PktType = tcpip.PacketOutgoing // Add the link layer header as outgoing packets are intercepted // before the link layer header is created. - n.linkEP.AddHeader(local, remote, protocol, p) + n.LinkEndpoint.AddHeader(local, remote, protocol, p) ep.HandlePacket(n.id, local, protocol, p) } } -func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - // TODO(b/143425874) Decrease the TTL field in forwarded packets. - - // pkt may have set its header and may not have enough headroom for link-layer - // header for the other link to prepend. Here we create a new packet to - // forward. - fwdPkt := NewPacketBuffer(PacketBufferOptions{ - ReserveHeaderBytes: int(n.linkEP.MaxHeaderLength()), - Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), - }) - - // WritePacket takes ownership of fwdPkt, calculate numBytes first. 
- numBytes := fwdPkt.Size() - - if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, fwdPkt); err != nil { - r.Stats().IP.OutgoingPacketErrors.Increment() - return - } - - n.stats.Tx.Packets.Increment() - n.stats.Tx.Bytes.IncrementBy(uint64(numBytes)) -} - // DeliverTransportPacket delivers the packets to the appropriate transport // protocol endpoint. -func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) { +func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition { state, ok := n.stack.transportProtocols[protocol] if !ok { n.stack.stats.UnknownProtocolRcvdPackets.Increment() - return + return TransportPacketProtocolUnreachable } transProto := state.proto @@ -1374,41 +750,47 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN // we parse it using the minimum size. if _, ok := pkt.TransportHeader().Consume(transProto.MinimumPacketSize()); !ok { n.stack.stats.MalformedRcvdPackets.Increment() - return + // We consider a malformed transport packet handled because there is + // nothing the caller can do. + return TransportPacketHandled } - } else { - // This is either a bad packet or was re-assembled from fragments. - transProto.Parse(pkt) + } else if !transProto.Parse(pkt) { + n.stack.stats.MalformedRcvdPackets.Increment() + return TransportPacketHandled } } - if pkt.TransportHeader().View().Size() < transProto.MinimumPacketSize() { - n.stack.stats.MalformedRcvdPackets.Increment() - return - } - srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader().View()) if err != nil { n.stack.stats.MalformedRcvdPackets.Increment() - return + return TransportPacketHandled } id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress} if n.stack.demux.deliverPacket(r, protocol, pkt, id) { - return + return TransportPacketHandled } // Try to deliver to per-stack default handler. if state.defaultHandler != nil { if state.defaultHandler(r, id, pkt) { - return + return TransportPacketHandled } } - // We could not find an appropriate destination for this packet, so - // deliver it to the global handler. - if !transProto.HandleUnknownDestinationPacket(r, id, pkt) { + // We could not find an appropriate destination for this packet so + // give the protocol specific error handler a chance to handle it. + // If it doesn't handle it then we should do so. + switch res := transProto.HandleUnknownDestinationPacket(r, id, pkt); res { + case UnknownDestinationPacketMalformed: n.stack.stats.MalformedRcvdPackets.Increment() + return TransportPacketHandled + case UnknownDestinationPacketUnhandled: + return TransportPacketDestinationPortUnreachable + case UnknownDestinationPacketHandled: + return TransportPacketHandled + default: + panic(fmt.Sprintf("unrecognized result from HandleUnknownDestinationPacket = %d", res)) } } @@ -1441,96 +823,18 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp } } -// ID returns the identifier of n. +// ID implements NetworkInterface. func (n *NIC) ID() tcpip.NICID { return n.id } -// Name returns the name of n. +// Name implements NetworkInterface. func (n *NIC) Name() string { return n.name } -// Stack returns the instance of the Stack that owns this NIC. -func (n *NIC) Stack() *Stack { - return n.stack -} - -// LinkEndpoint returns the link endpoint of n. 
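DeliverTransportPacket, reworked below, now reports a disposition instead of silently dropping, so the network layer can generate the right error reply. A hedged sketch of a consumer; sendPortUnreachable and sendProtocolUnreachable are hypothetical stand-ins for the ICMP replies a real network endpoint would send:

func reactToDisposition(r *Route, pkt *PacketBuffer, disposition TransportPacketDisposition) {
	switch disposition {
	case TransportPacketHandled:
		// Nothing left to do.
	case TransportPacketDestinationPortUnreachable:
		// sendPortUnreachable(r, pkt)
	case TransportPacketProtocolUnreachable:
		// sendProtocolUnreachable(r, pkt)
	}
}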
-func (n *NIC) LinkEndpoint() LinkEndpoint { - return n.linkEP -} - -// isAddrTentative returns true if addr is tentative on n. -// -// Note that if addr is not associated with n, then this function will return -// false. It will only return true if the address is associated with the NIC -// AND it is tentative. -func (n *NIC) isAddrTentative(addr tcpip.Address) bool { - n.mu.RLock() - defer n.mu.RUnlock() - - ref, ok := n.mu.endpoints[NetworkEndpointID{addr}] - if !ok { - return false - } - - return ref.getKind() == permanentTentative -} - -// dupTentativeAddrDetected attempts to inform n that a tentative addr is a -// duplicate on a link. -// -// dupTentativeAddrDetected will remove the tentative address if it exists. If -// the address was generated via SLAAC, an attempt will be made to generate a -// new address. -func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error { - n.mu.Lock() - defer n.mu.Unlock() - - ref, ok := n.mu.endpoints[NetworkEndpointID{addr}] - if !ok { - return tcpip.ErrBadAddress - } - - if ref.getKind() != permanentTentative { - return tcpip.ErrInvalidEndpointState - } - - // If the address is a SLAAC address, do not invalidate its SLAAC prefix as a - // new address will be generated for it. - if err := n.removePermanentIPv6EndpointLocked(ref, false /* allowSLAACInvalidation */); err != nil { - return err - } - - prefix := ref.addrWithPrefix().Subnet() - - switch ref.configType { - case slaac: - n.mu.ndp.regenerateSLAACAddr(prefix) - case slaacTemp: - // Do not reset the generation attempts counter for the prefix as the - // temporary address is being regenerated in response to a DAD conflict. - n.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */) - } - - return nil -} - -// setNDPConfigs sets the NDP configurations for n. -// -// Note, if c contains invalid NDP configuration values, it will be fixed to -// use default values for the erroneous values. -func (n *NIC) setNDPConfigs(c NDPConfigurations) { - c.validate() - - n.mu.Lock() - n.mu.ndp.configs = c - n.mu.Unlock() -} - -// NUDConfigs gets the NUD configurations for n. -func (n *NIC) NUDConfigs() (NUDConfigurations, *tcpip.Error) { +// nudConfigs gets the NUD configurations for n. +func (n *NIC) nudConfigs() (NUDConfigurations, *tcpip.Error) { if n.neigh == nil { return NUDConfigurations{}, tcpip.ErrNotSupported } @@ -1550,49 +854,6 @@ func (n *NIC) setNUDConfigs(c NUDConfigurations) *tcpip.Error { return nil } -// handleNDPRA handles an NDP Router Advertisement message that arrived on n. -func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) { - n.mu.Lock() - defer n.mu.Unlock() - - n.mu.ndp.handleRA(ip, ra) -} - -type networkEndpointKind int32 - -const ( - // A permanentTentative endpoint is a permanent address that is not yet - // considered to be fully bound to an interface in the traditional - // sense. That is, the address is associated with a NIC, but packets - // destined to the address MUST NOT be accepted and MUST be silently - // dropped, and the address MUST NOT be used as a source address for - // outgoing packets. For IPv6, addresses will be of this kind until - // NDP's Duplicate Address Detection has resolved, or be deleted if - // the process results in detecting a duplicate address. - permanentTentative networkEndpointKind = iota - - // A permanent endpoint is created by adding a permanent address (vs. a - // temporary one) to the NIC. Its reference count is biased by 1 to avoid - // removal when no route holds a reference to it. 
It is removed by explicitly
-	// removing the permanent address from the NIC.
-	permanent
-
-	// An expired permanent endpoint is a permanent endpoint that had its address
-	// removed from the NIC, and it is waiting to be removed once no more routes
-	// hold a reference to it. This is achieved by decreasing its reference count
-	// by 1. If its address is re-added before the endpoint is removed, its type
-	// changes back to permanent and its reference count increases by 1 again.
-	permanentExpired
-
-	// A temporary endpoint is created for spoofing outgoing packets, or when in
-	// promiscuous mode and accepting incoming packets that don't match any
-	// permanent endpoint. Its reference count is not biased by 1 and the
-	// endpoint is removed immediately when no more route holds a reference to
-	// it. A temporary endpoint can be promoted to permanent if its address
-	// is added permanently.
-	temporary
-)
-
 func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
@@ -1623,149 +884,12 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep
 	}
 }
 
-type networkEndpointConfigType int32
-
-const (
-	// A statically configured endpoint is an address that was added by
-	// some user-specified action (adding an explicit address, joining a
-	// multicast group).
-	static networkEndpointConfigType = iota
-
-	// A SLAAC configured endpoint is an IPv6 endpoint that was added by
-	// SLAAC as per RFC 4862 section 5.5.3.
-	slaac
-
-	// A temporary SLAAC configured endpoint is an IPv6 endpoint that was added by
-	// SLAAC as per RFC 4941. Temporary SLAAC addresses are short-lived and are
-	// not expected to be valid (or preferred) forever; hence the term temporary.
-	slaacTemp
-)
-
-type referencedNetworkEndpoint struct {
-	ep       NetworkEndpoint
-	addr     tcpip.AddressWithPrefix
-	nic      *NIC
-	protocol tcpip.NetworkProtocolNumber
-
-	// linkCache is set if link address resolution is enabled for this
-	// protocol. Set to nil otherwise.
-	linkCache LinkAddressCache
-
-	// refs is counting references held for this endpoint. When refs hits zero it
-	// triggers the automatic removal of the endpoint from the NIC.
-	refs int32
-
-	// networkEndpointKind must only be accessed using {get,set}Kind().
-	kind networkEndpointKind
-
-	// configType is the method that was used to configure this endpoint.
-	// This must never change except during endpoint creation and promotion to
-	// permanent.
-	configType networkEndpointConfigType
-
-	// deprecated indicates whether or not the endpoint should be considered
-	// deprecated. That is, when deprecated is true, other endpoints that are not
-	// deprecated should be preferred.
-	deprecated bool
-}
-
-func (r *referencedNetworkEndpoint) address() tcpip.Address {
-	return r.addr.Address
-}
-
-func (r *referencedNetworkEndpoint) addrWithPrefix() tcpip.AddressWithPrefix {
-	return r.addr
-}
-
-func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
-	return networkEndpointKind(atomic.LoadInt32((*int32)(&r.kind)))
-}
-
-func (r *referencedNetworkEndpoint) setKind(kind networkEndpointKind) {
-	atomic.StoreInt32((*int32)(&r.kind), int32(kind))
-}
-
 // isValidForOutgoing returns true if the endpoint can be used to send out a
 // packet. It requires the endpoint to not be marked expired (i.e., its address
 // has been removed) unless the NIC is in spoofing mode, or temporary.
-func (r *referencedNetworkEndpoint) isValidForOutgoing() bool { - r.nic.mu.RLock() - defer r.nic.mu.RUnlock() - - return r.isValidForOutgoingRLocked() -} - -// isValidForOutgoingRLocked is the same as isValidForOutgoing but requires -// r.nic.mu to be read locked. -func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool { - if !r.nic.mu.enabled { - return false - } - - return r.isAssignedRLocked(r.nic.mu.spoofing) -} - -// isAssignedRLocked returns true if r is considered to be assigned to the NIC. -// -// r.nic.mu must be read locked. -func (r *referencedNetworkEndpoint) isAssignedRLocked(spoofingOrPromiscuous bool) bool { - switch r.getKind() { - case permanentTentative: - return false - case permanentExpired: - return spoofingOrPromiscuous - default: - return true - } -} - -// expireLocked decrements the reference count and marks the permanent endpoint -// as expired. -func (r *referencedNetworkEndpoint) expireLocked() { - r.setKind(permanentExpired) - r.decRefLocked() -} - -// decRef decrements the ref count and cleans up the endpoint once it reaches -// zero. -func (r *referencedNetworkEndpoint) decRef() { - if atomic.AddInt32(&r.refs, -1) == 0 { - r.nic.removeEndpoint(r) - } -} - -// decRefLocked is the same as decRef but assumes that the NIC.mu mutex is -// locked. -func (r *referencedNetworkEndpoint) decRefLocked() { - if atomic.AddInt32(&r.refs, -1) == 0 { - r.nic.removeEndpointLocked(r) - } -} - -// incRef increments the ref count. It must only be called when the caller is -// known to be holding a reference to the endpoint, otherwise tryIncRef should -// be used. -func (r *referencedNetworkEndpoint) incRef() { - atomic.AddInt32(&r.refs, 1) -} - -// tryIncRef attempts to increment the ref count from n to n+1, but only if n is -// not zero. That is, it will increment the count if the endpoint is still -// alive, and do nothing if it has already been clean up. -func (r *referencedNetworkEndpoint) tryIncRef() bool { - for { - v := atomic.LoadInt32(&r.refs) - if v == 0 { - return false - } - - if atomic.CompareAndSwapInt32(&r.refs, v, v+1) { - return true - } - } -} - -// stack returns the Stack instance that owns the underlying endpoint. -func (r *referencedNetworkEndpoint) stack() *Stack { - return r.nic.stack +func (n *NIC) isValidForOutgoing(ep AssignableAddressEndpoint) bool { + n.mu.RLock() + spoofing := n.mu.spoofing + n.mu.RUnlock() + return n.Enabled() && ep.IsAssigned(spoofing) } diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go index d312a79eb..97a96af62 100644 --- a/pkg/tcpip/stack/nic_test.go +++ b/pkg/tcpip/stack/nic_test.go @@ -15,96 +15,39 @@ package stack import ( - "math" "testing" - "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" ) -var _ LinkEndpoint = (*testLinkEndpoint)(nil) +var _ AddressableEndpoint = (*testIPv6Endpoint)(nil) +var _ NetworkEndpoint = (*testIPv6Endpoint)(nil) +var _ NDPEndpoint = (*testIPv6Endpoint)(nil) -// A LinkEndpoint that throws away outgoing packets. +// An IPv6 NetworkEndpoint that throws away outgoing packets. // -// We use this instead of the channel endpoint as the channel package depends on +// We use this instead of ipv6.endpoint because the ipv6 package depends on // the stack package which this test lives in, causing a cyclic dependency. -type testLinkEndpoint struct { - dispatcher NetworkDispatcher -} - -// Attach implements LinkEndpoint.Attach. 
-func (e *testLinkEndpoint) Attach(dispatcher NetworkDispatcher) { - e.dispatcher = dispatcher -} - -// IsAttached implements LinkEndpoint.IsAttached. -func (e *testLinkEndpoint) IsAttached() bool { - return e.dispatcher != nil -} - -// MTU implements LinkEndpoint.MTU. -func (*testLinkEndpoint) MTU() uint32 { - return math.MaxUint16 -} - -// Capabilities implements LinkEndpoint.Capabilities. -func (*testLinkEndpoint) Capabilities() LinkEndpointCapabilities { - return CapabilityResolutionRequired -} +type testIPv6Endpoint struct { + AddressableEndpointState -// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength. -func (*testLinkEndpoint) MaxHeaderLength() uint16 { - return 0 -} + nic NetworkInterface + protocol *testIPv6Protocol -// LinkAddress returns the link address of this endpoint. -func (*testLinkEndpoint) LinkAddress() tcpip.LinkAddress { - return "" + invalidatedRtr tcpip.Address } -// Wait implements LinkEndpoint.Wait. -func (*testLinkEndpoint) Wait() {} - -// WritePacket implements LinkEndpoint.WritePacket. -func (e *testLinkEndpoint) WritePacket(*Route, *GSO, tcpip.NetworkProtocolNumber, *PacketBuffer) *tcpip.Error { +func (*testIPv6Endpoint) Enable() *tcpip.Error { return nil } -// WritePackets implements LinkEndpoint.WritePackets. -func (e *testLinkEndpoint) WritePackets(*Route, *GSO, PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { - // Our tests don't use this so we don't support it. - return 0, tcpip.ErrNotSupported -} - -// WriteRawPacket implements LinkEndpoint.WriteRawPacket. -func (e *testLinkEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { - // Our tests don't use this so we don't support it. - return tcpip.ErrNotSupported -} - -// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. -func (*testLinkEndpoint) ARPHardwareType() header.ARPHardwareType { - panic("not implemented") -} - -// AddHeader implements stack.LinkEndpoint.AddHeader. -func (e *testLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - panic("not implemented") +func (*testIPv6Endpoint) Enabled() bool { + return true } -var _ NetworkEndpoint = (*testIPv6Endpoint)(nil) - -// An IPv6 NetworkEndpoint that throws away outgoing packets. -// -// We use this instead of ipv6.endpoint because the ipv6 package depends on -// the stack package which this test lives in, causing a cyclic dependency. -type testIPv6Endpoint struct { - nicID tcpip.NICID - linkEP LinkEndpoint - protocol *testIPv6Protocol -} +func (*testIPv6Endpoint) Disable() {} // DefaultTTL implements NetworkEndpoint.DefaultTTL. func (*testIPv6Endpoint) DefaultTTL() uint8 { @@ -113,17 +56,12 @@ func (*testIPv6Endpoint) DefaultTTL() uint8 { // MTU implements NetworkEndpoint.MTU. func (e *testIPv6Endpoint) MTU() uint32 { - return e.linkEP.MTU() - header.IPv6MinimumSize -} - -// Capabilities implements NetworkEndpoint.Capabilities. -func (e *testIPv6Endpoint) Capabilities() LinkEndpointCapabilities { - return e.linkEP.Capabilities() + return e.nic.MTU() - header.IPv6MinimumSize } // MaxHeaderLength implements NetworkEndpoint.MaxHeaderLength. func (e *testIPv6Endpoint) MaxHeaderLength() uint16 { - return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize + return e.nic.MaxHeaderLength() + header.IPv6MinimumSize } // WritePacket implements NetworkEndpoint.WritePacket. @@ -144,23 +82,24 @@ func (*testIPv6Endpoint) WriteHeaderIncludedPacket(*Route, *PacketBuffer) *tcpip return tcpip.ErrNotSupported } -// NICID implements NetworkEndpoint.NICID. 
-func (e *testIPv6Endpoint) NICID() tcpip.NICID { - return e.nicID -} - // HandlePacket implements NetworkEndpoint.HandlePacket. func (*testIPv6Endpoint) HandlePacket(*Route, *PacketBuffer) { } // Close implements NetworkEndpoint.Close. -func (*testIPv6Endpoint) Close() {} +func (e *testIPv6Endpoint) Close() { + e.AddressableEndpointState.Cleanup() +} // NetworkProtocolNumber implements NetworkEndpoint.NetworkProtocolNumber. func (*testIPv6Endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return header.IPv6ProtocolNumber } +func (e *testIPv6Endpoint) InvalidateDefaultRouter(rtr tcpip.Address) { + e.invalidatedRtr = rtr +} + var _ NetworkProtocol = (*testIPv6Protocol)(nil) // An IPv6 NetworkProtocol that supports the bare minimum to make a stack @@ -192,21 +131,22 @@ func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) } // NewEndpoint implements NetworkProtocol.NewEndpoint. -func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) NetworkEndpoint { - return &testIPv6Endpoint{ - nicID: nicID, - linkEP: linkEP, +func (p *testIPv6Protocol) NewEndpoint(nic NetworkInterface, _ LinkAddressCache, _ NUDHandler, _ TransportDispatcher) NetworkEndpoint { + e := &testIPv6Endpoint{ + nic: nic, protocol: p, } + e.AddressableEndpointState.Init(e) + return e } // SetOption implements NetworkProtocol.SetOption. -func (*testIPv6Protocol) SetOption(interface{}) *tcpip.Error { +func (*testIPv6Protocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error { return nil } // Option implements NetworkProtocol.Option. -func (*testIPv6Protocol) Option(interface{}) *tcpip.Error { +func (*testIPv6Protocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error { return nil } @@ -241,38 +181,6 @@ func (*testIPv6Protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAdd return "", false } -// Test the race condition where a NIC is removed and an RS timer fires at the -// same time. -func TestRemoveNICWhileHandlingRSTimer(t *testing.T) { - const ( - nicID = 1 - - maxRtrSolicitations = 5 - ) - - e := testLinkEndpoint{} - s := New(Options{ - NetworkProtocols: []NetworkProtocol{&testIPv6Protocol{}}, - NDPConfigs: NDPConfigurations{ - MaxRtrSolicitations: maxRtrSolicitations, - RtrSolicitationInterval: minimumRtrSolicitationInterval, - }, - }) - - if err := s.CreateNIC(nicID, &e); err != nil { - t.Fatalf("s.CreateNIC(%d, _) = %s", nicID, err) - } - - s.mu.Lock() - // Wait for the router solicitation timer to fire and block trying to obtain - // the stack lock when doing link address resolution. - time.Sleep(minimumRtrSolicitationInterval * 2) - if err := s.removeNICLocked(nicID); err != nil { - t.Fatalf("s.removeNICLocked(%d) = %s", nicID, err) - } - s.mu.Unlock() -} - func TestDisabledRxStatsWhenNICDisabled(t *testing.T) { // When the NIC is disabled, the only field that matters is the stats field. // This test is limited to stats counter checks. diff --git a/pkg/tcpip/stack/nud_test.go b/pkg/tcpip/stack/nud_test.go index 2494ee610..8cffb9fc6 100644 --- a/pkg/tcpip/stack/nud_test.go +++ b/pkg/tcpip/stack/nud_test.go @@ -60,7 +60,8 @@ func TestSetNUDConfigurationFailsForBadNICID(t *testing.T) { // A neighbor cache is required to store NUDConfigurations. The networking // stack will only allocate neighbor caches if a protocol providing link // address resolution is specified (e.g. ARP or IPv6). 
- NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, + UseNeighborCache: true, }) // No NIC with ID 1 yet. @@ -84,7 +85,8 @@ func TestNUDConfigurationFailsForNotSupported(t *testing.T) { e.LinkEPCapabilities |= stack.CapabilityResolutionRequired s := stack.New(stack.Options{ - NUDConfigs: stack.DefaultNUDConfigurations(), + NUDConfigs: stack.DefaultNUDConfigurations(), + UseNeighborCache: true, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -108,7 +110,8 @@ func TestSetNUDConfigurationFailsForNotSupported(t *testing.T) { e.LinkEPCapabilities |= stack.CapabilityResolutionRequired s := stack.New(stack.Options{ - NUDConfigs: stack.DefaultNUDConfigurations(), + NUDConfigs: stack.DefaultNUDConfigurations(), + UseNeighborCache: true, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -134,8 +137,9 @@ func TestDefaultNUDConfigurations(t *testing.T) { // A neighbor cache is required to store NUDConfigurations. The networking // stack will only allocate neighbor caches if a protocol providing link // address resolution is specified (e.g. ARP or IPv6). - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, NUDConfigs: stack.DefaultNUDConfigurations(), + UseNeighborCache: true, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -188,8 +192,9 @@ func TestNUDConfigurationsBaseReachableTime(t *testing.T) { // A neighbor cache is required to store NUDConfigurations. The // networking stack will only allocate neighbor caches if a protocol // providing link address resolution is specified (e.g. ARP or IPv6). - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, NUDConfigs: c, + UseNeighborCache: true, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -244,8 +249,9 @@ func TestNUDConfigurationsMinRandomFactor(t *testing.T) { // A neighbor cache is required to store NUDConfigurations. The // networking stack will only allocate neighbor caches if a protocol // providing link address resolution is specified (e.g. ARP or IPv6). - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, NUDConfigs: c, + UseNeighborCache: true, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -323,8 +329,9 @@ func TestNUDConfigurationsMaxRandomFactor(t *testing.T) { // A neighbor cache is required to store NUDConfigurations. The // networking stack will only allocate neighbor caches if a protocol // providing link address resolution is specified (e.g. ARP or IPv6). - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, NUDConfigs: c, + UseNeighborCache: true, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -384,8 +391,9 @@ func TestNUDConfigurationsRetransmitTimer(t *testing.T) { // A neighbor cache is required to store NUDConfigurations. The // networking stack will only allocate neighbor caches if a protocol // providing link address resolution is specified (e.g. ARP or IPv6). 
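Every test in this file is updated with the same recipe: network protocols are now registered as factories and the neighbor cache is opted into explicitly. A stand-alone sketch of that construction (package name and helper are illustrative):

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// newStackWithNUD builds a stack the way the updated tests do.
func newStackWithNUD() *stack.Stack {
	return stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
		NUDConfigs:       stack.DefaultNUDConfigurations(),
		UseNeighborCache: true,
	})
}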
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		NUDConfigs:       c,
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -435,8 +443,9 @@ func TestNUDConfigurationsDelayFirstProbeTime(t *testing.T) {
 		// A neighbor cache is required to store NUDConfigurations. The
 		// networking stack will only allocate neighbor caches if a protocol
 		// providing link address resolution is specified (e.g. ARP or IPv6).
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		NUDConfigs:       c,
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -486,8 +495,9 @@ func TestNUDConfigurationsMaxMulticastProbes(t *testing.T) {
 		// A neighbor cache is required to store NUDConfigurations. The
 		// networking stack will only allocate neighbor caches if a protocol
 		// providing link address resolution is specified (e.g. ARP or IPv6).
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		NUDConfigs:       c,
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -537,8 +547,9 @@ func TestNUDConfigurationsMaxUnicastProbes(t *testing.T) {
 		// A neighbor cache is required to store NUDConfigurations. The
 		// networking stack will only allocate neighbor caches if a protocol
 		// providing link address resolution is specified (e.g. ARP or IPv6).
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		NUDConfigs:       c,
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -588,8 +599,9 @@ func TestNUDConfigurationsUnreachableTime(t *testing.T) {
 		// A neighbor cache is required to store NUDConfigurations. The
 		// networking stack will only allocate neighbor caches if a protocol
 		// providing link address resolution is specified (e.g. ARP or IPv6).
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		NUDConfigs:       c,
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 17b8beebb..7f54a6de8 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -19,6 +19,9 @@ import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 type headerType int
@@ -80,11 +81,17 @@ type PacketBuffer struct {
 	// data are held in the same underlying buffer storage.
 	header buffer.Prependable
 
-	// NetworkProtocol is only valid when NetworkHeader is set.
+	// NetworkProtocolNumber is only valid when NetworkHeader().View().IsEmpty()
+	// returns false.
 	// TODO(gvisor.dev/issue/3574): Remove the separately passed protocol
 	// numbers in registration APIs that take a PacketBuffer.
 	NetworkProtocolNumber tcpip.NetworkProtocolNumber
 
+	// TransportProtocolNumber is only valid if it is non-zero.
+	// TODO(gvisor.dev/issue/3810): This and the network protocol number should
+	// be moved into the headerinfo. This should resolve the validity issue.
+ TransportProtocolNumber tcpip.TransportProtocolNumber + // Hash is the transport layer hash of this packet. A value of zero // indicates no valid hash has been set. Hash uint32 @@ -234,20 +241,35 @@ func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consum // underlying packet payload. func (pk *PacketBuffer) Clone() *PacketBuffer { newPk := &PacketBuffer{ - PacketBufferEntry: pk.PacketBufferEntry, - Data: pk.Data.Clone(nil), - headers: pk.headers, - header: pk.header, - Hash: pk.Hash, - Owner: pk.Owner, - EgressRoute: pk.EgressRoute, - GSOOptions: pk.GSOOptions, - NetworkProtocolNumber: pk.NetworkProtocolNumber, - NatDone: pk.NatDone, + PacketBufferEntry: pk.PacketBufferEntry, + Data: pk.Data.Clone(nil), + headers: pk.headers, + header: pk.header, + Hash: pk.Hash, + Owner: pk.Owner, + EgressRoute: pk.EgressRoute, + GSOOptions: pk.GSOOptions, + NetworkProtocolNumber: pk.NetworkProtocolNumber, + NatDone: pk.NatDone, + TransportProtocolNumber: pk.TransportProtocolNumber, } return newPk } +// Network returns the network header as a header.Network. +// +// Network should only be called when NetworkHeader has been set. +func (pk *PacketBuffer) Network() header.Network { + switch netProto := pk.NetworkProtocolNumber; netProto { + case header.IPv4ProtocolNumber: + return header.IPv4(pk.NetworkHeader().View()) + case header.IPv6ProtocolNumber: + return header.IPv6(pk.NetworkHeader().View()) + default: + panic(fmt.Sprintf("unknown network protocol number %d", netProto)) + } +} + // headerInfo stores metadata about a header in a packet. type headerInfo struct { // buf is the memorized slice for both prepended and consumed header. @@ -289,11 +311,25 @@ func (h PacketHeader) Consume(size int) (v buffer.View, consumed bool) { } // PayloadSince returns packet payload starting from and including a particular -// header. This method isn't optimized and should be used in test only. +// header. +// +// The returned View is owned by the caller - its backing buffer is separate +// from the packet header's underlying packet buffer. func PayloadSince(h PacketHeader) buffer.View { - var v buffer.View + size := h.pk.Data.Size() + for _, hinfo := range h.pk.headers[h.typ:] { + size += len(hinfo.buf) + } + + v := make(buffer.View, 0, size) + for _, hinfo := range h.pk.headers[h.typ:] { v = append(v, hinfo.buf...) } - return append(v, h.pk.Data.ToView()...) + + for _, view := range h.pk.Data.Views() { + v = append(v, view...) + } + + return v } diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/pending_packets.go index 3eff141e6..f838eda8d 100644 --- a/pkg/tcpip/stack/forwarder.go +++ b/pkg/tcpip/stack/pending_packets.go @@ -29,60 +29,60 @@ const ( ) type pendingPacket struct { - nic *NIC route *Route proto tcpip.NetworkProtocolNumber pkt *PacketBuffer } -type forwardQueue struct { +// packetsPendingLinkResolution is a queue of packets pending link resolution. +// +// Once link resolution completes successfully, the packets will be written. +type packetsPendingLinkResolution struct { sync.Mutex // The packets to send once the resolver completes. - packets map[<-chan struct{}][]*pendingPacket + packets map[<-chan struct{}][]pendingPacket // FIFO of channels used to cancel the oldest goroutine waiting for // link-address resolution. 
cancelChans []chan struct{} } -func newForwardQueue() *forwardQueue { - return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)} +func (f *packetsPendingLinkResolution) init() { + f.Lock() + defer f.Unlock() + f.packets = make(map[<-chan struct{}][]pendingPacket) } -func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - shouldWait := false - +func (f *packetsPendingLinkResolution) enqueue(ch <-chan struct{}, r *Route, proto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { f.Lock() + defer f.Unlock() + packets, ok := f.packets[ch] - if !ok { - shouldWait = true - } - for len(packets) == maxPendingPacketsPerResolution { + if len(packets) == maxPendingPacketsPerResolution { p := packets[0] + packets[0] = pendingPacket{} packets = packets[1:] - p.nic.stack.stats.IP.OutgoingPacketErrors.Increment() + p.route.Stats().IP.OutgoingPacketErrors.Increment() p.route.Release() } + if l := len(packets); l >= maxPendingPacketsPerResolution { panic(fmt.Sprintf("max pending packets for resolution reached; got %d packets, max = %d", l, maxPendingPacketsPerResolution)) } - f.packets[ch] = append(packets, &pendingPacket{ - nic: n, + + f.packets[ch] = append(packets, pendingPacket{ route: r, - proto: protocol, + proto: proto, pkt: pkt, }) - f.Unlock() - if !shouldWait { + if ok { return } // Wait for the link-address resolution to complete. - // Start a goroutine with a forwarding-cancel channel so that we can - // limit the maximum number of goroutines running concurrently. - cancel := f.newCancelChannel() + cancel := f.newCancelChannelLocked() go func() { cancelled := false select { @@ -92,17 +92,21 @@ func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tc } f.Lock() - packets := f.packets[ch] + packets, ok := f.packets[ch] delete(f.packets, ch) f.Unlock() + if !ok { + panic(fmt.Sprintf("link-resolution goroutine woke up but no entry exists in the queue of packets")) + } + for _, p := range packets { if cancelled { - p.nic.stack.stats.IP.OutgoingPacketErrors.Increment() + p.route.Stats().IP.OutgoingPacketErrors.Increment() } else if _, err := p.route.Resolve(nil); err != nil { - p.nic.stack.stats.IP.OutgoingPacketErrors.Increment() + p.route.Stats().IP.OutgoingPacketErrors.Increment() } else { - p.nic.forwardPacket(p.route, p.proto, p.pkt) + p.route.nic.writePacket(p.route, nil /* gso */, p.proto, p.pkt) } p.route.Release() } @@ -112,12 +116,10 @@ func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tc // newCancelChannel creates a channel that can cancel a pending forwarding // activity. The oldest channel is closed if the number of open channels would // exceed maxPendingResolutions. 
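The queue is fed from the write path: a writer that gets tcpip.ErrWouldBlock from Route.Resolve parks its packet here, and the goroutine spawned by enqueue drains the entry once the resolution channel is closed. A hedged sketch of that handoff, as if in package stack; writePending is hypothetical:

func writePending(q *packetsPendingLinkResolution, r *Route, proto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
	ch, err := r.Resolve(nil)
	switch err {
	case nil:
		// The remote link address is already known; write immediately.
		return r.nic.writePacket(r, nil /* gso */, proto, pkt)
	case tcpip.ErrWouldBlock:
		// enqueue takes ownership; the packet is written, or dropped, once
		// link-address resolution completes or is cancelled.
		q.enqueue(ch, r, proto, pkt)
		return nil
	default:
		return err
	}
}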
-func (f *forwardQueue) newCancelChannel() chan struct{} {
-	f.Lock()
-	defer f.Unlock()
-
+func (f *packetsPendingLinkResolution) newCancelChannelLocked() chan struct{} {
 	if len(f.cancelChans) == maxPendingResolutions {
 		ch := f.cancelChans[0]
+		f.cancelChans[0] = nil
 		f.cancelChans = f.cancelChans[1:]
 		close(ch)
 	}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index aca2f77f8..defb9129b 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -15,6 +15,8 @@ package stack
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -125,6 +127,26 @@ type PacketEndpoint interface {
 	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 }
 
+// UnknownDestinationPacketDisposition enumerates the possible return values from
+// HandleUnknownDestinationPacket().
+type UnknownDestinationPacketDisposition int
+
+const (
+	// UnknownDestinationPacketMalformed denotes that the packet was malformed
+	// and no further processing should be attempted other than updating
+	// statistics.
+	UnknownDestinationPacketMalformed UnknownDestinationPacketDisposition = iota
+
+	// UnknownDestinationPacketUnhandled tells the caller that the packet was
+	// well formed but that the issue was not handled and the stack should take
+	// the default action.
+	UnknownDestinationPacketUnhandled
+
+	// UnknownDestinationPacketHandled tells the caller that it should do
+	// no further processing.
+	UnknownDestinationPacketHandled
+)
+
 // TransportProtocol is the interface that needs to be implemented by transport
 // protocols (e.g., tcp, udp) that want to be part of the networking stack.
 type TransportProtocol interface {
@@ -132,10 +154,10 @@ type TransportProtocol interface {
 	Number() tcpip.TransportProtocolNumber
 
 	// NewEndpoint creates a new endpoint of the transport protocol.
-	NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+	NewEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
 
 	// NewRawEndpoint creates a new raw endpoint of the transport protocol.
-	NewRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+	NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
 
 	// MinimumPacketSize returns the minimum valid packet size of this
 	// transport protocol. The stack automatically drops any packets smaller
@@ -147,24 +169,22 @@ type TransportProtocol interface {
 	ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 	// HandleUnknownDestinationPacket handles packets targeted at this
-	// protocol but that don't match any existing endpoint. For example,
-	// it is targeted at a port that have no listeners.
-	//
-	// The return value indicates whether the packet was well-formed (for
-	// stats purposes only).
+	// protocol that don't match any existing endpoint. For example,
+	// it is targeted at a port that has no listeners.
 	//
-	// HandleUnknownDestinationPacket takes ownership of pkt.
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
+	// HandleUnknownDestinationPacket takes ownership of pkt if it handles
+	// the issue.
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) UnknownDestinationPacketDisposition
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
 	// provided option value is invalid.
-	SetOption(option interface{}) *tcpip.Error
+	SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error
 
 	// Option allows retrieving protocol specific option values.
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
-	Option(option interface{}) *tcpip.Error
+	Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error
 
 	// Close requests that any worker goroutines owned by the protocol
 	// stop.
@@ -179,6 +199,25 @@ type TransportProtocol interface {
 	Parse(pkt *PacketBuffer) (ok bool)
 }
 
+// TransportPacketDisposition is the result from attempting to deliver a packet
+// to the transport layer.
+type TransportPacketDisposition int
+
+const (
+	// TransportPacketHandled indicates that a transport packet was handled by the
+	// transport layer and callers need not take any further action.
+	TransportPacketHandled TransportPacketDisposition = iota
+
+	// TransportPacketProtocolUnreachable indicates that the transport
+	// protocol requested in the packet is not supported.
+	TransportPacketProtocolUnreachable
+
+	// TransportPacketDestinationPortUnreachable indicates that there weren't any
+	// listeners interested in the packet and the transport protocol has no means
+	// to notify the sender.
+	TransportPacketDestinationPortUnreachable
+)
+
 // TransportDispatcher contains the methods used by the network stack to deliver
 // packets to the appropriate transport endpoint after it has been handled by
 // the network layer.
@@ -189,7 +228,7 @@ type TransportDispatcher interface {
 	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
 	// DeliverTransportPacket takes ownership of pkt.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer)
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
@@ -226,9 +265,252 @@ type NetworkHeaderParams struct {
 	TOS uint8
 }
 
+// GroupAddressableEndpoint is an endpoint that supports group addressing.
+//
+// An endpoint is considered to support group addressing when one or more
+// endpoints may associate themselves with the same identifier (group address).
+type GroupAddressableEndpoint interface {
+	// JoinGroup joins the specified group.
+	//
+	// Returns true if the group was newly joined.
+	JoinGroup(group tcpip.Address) (bool, *tcpip.Error)
+
+	// LeaveGroup attempts to leave the specified group.
+	//
+	// Returns tcpip.ErrBadLocalAddress if the endpoint has not joined the group.
+	LeaveGroup(group tcpip.Address) (bool, *tcpip.Error)
+
+	// IsInGroup returns true if the endpoint is a member of the specified group.
+	IsInGroup(group tcpip.Address) bool
+}
+
+// PrimaryEndpointBehavior is an enumeration of an AddressEndpoint's primary
+// behavior.
+type PrimaryEndpointBehavior int
+
+const (
+	// CanBePrimaryEndpoint indicates the endpoint can be used as a primary
+	// endpoint for new connections with no local address. This is the
+	// default when calling NIC.AddAddress.
+	CanBePrimaryEndpoint PrimaryEndpointBehavior = iota
+
+	// FirstPrimaryEndpoint indicates the endpoint should be the first
+	// primary endpoint considered. If there are multiple endpoints with
+	// this behavior, they are ordered by recency.
+	FirstPrimaryEndpoint
+
+	// NeverPrimaryEndpoint indicates the endpoint should never be a
+	// primary endpoint.
+	NeverPrimaryEndpoint
+)
+
+// AddressConfigType is the method used to add an address.
+type AddressConfigType int
+
+const (
+	// AddressConfigStatic is a statically configured address endpoint that was
+	// added by some user-specified action (adding an explicit address, joining a
+	// multicast group).
+	AddressConfigStatic AddressConfigType = iota
+
+	// AddressConfigSlaac is an address endpoint added by SLAAC, as per RFC 4862
+	// section 5.5.3.
+	AddressConfigSlaac
+
+	// AddressConfigSlaacTemp is a temporary address endpoint added by SLAAC as
+	// per RFC 4941. Temporary SLAAC addresses are short-lived and are not
+	// expected to be valid (or preferred) forever; hence the term temporary.
+	AddressConfigSlaacTemp
+)
+
+// AssignableAddressEndpoint is a reference-counted address endpoint that may be
+// assigned to a NetworkEndpoint.
+type AssignableAddressEndpoint interface {
+	// AddressWithPrefix returns the endpoint's address.
+	AddressWithPrefix() tcpip.AddressWithPrefix
+
+	// IsAssigned returns whether or not the endpoint is considered bound
+	// to its NetworkEndpoint.
+	IsAssigned(allowExpired bool) bool
+
+	// IncRef increments this endpoint's reference count.
+	//
+	// Returns true if it was successfully incremented. If it returns false, then
+	// the endpoint is considered expired and should no longer be used.
+	IncRef() bool
+
+	// DecRef decrements this endpoint's reference count.
+	DecRef()
+}
+
+// AddressEndpoint is an endpoint representing an address assigned to an
+// AddressableEndpoint.
+type AddressEndpoint interface {
+	AssignableAddressEndpoint
+
+	// GetKind returns the address kind for this endpoint.
+	GetKind() AddressKind
+
+	// SetKind sets the address kind for this endpoint.
+	SetKind(AddressKind)
+
+	// ConfigType returns the method used to add the address.
+	ConfigType() AddressConfigType
+
+	// Deprecated returns whether or not this endpoint is deprecated.
+	Deprecated() bool
+
+	// SetDeprecated sets this endpoint's deprecated status.
+	SetDeprecated(bool)
+}
+
+// AddressKind is the kind of an address.
+//
+// See the values of AddressKind for more details.
+type AddressKind int
+
+const (
+	// PermanentTentative is a permanent address endpoint that is not yet
+	// considered to be fully bound to an interface in the traditional
+	// sense. That is, the address is associated with a NIC, but packets
+	// destined to the address MUST NOT be accepted and MUST be silently
+	// dropped, and the address MUST NOT be used as a source address for
+	// outgoing packets. For IPv6, addresses are of this kind until NDP's
+	// Duplicate Address Detection (DAD) resolves. If DAD fails, the address
+	// is removed.
+	PermanentTentative AddressKind = iota
+
+	// Permanent is a permanent endpoint (vs. a temporary one) assigned to the
+	// NIC. Its reference count is biased by 1 to avoid removal when no route
+	// holds a reference to it. It is removed by explicitly removing the address
+	// from the NIC.
+	Permanent
+
+	// PermanentExpired is a permanent endpoint that had its address removed from
+	// the NIC, and it is waiting to be removed once no references to it are held.
+	//
+	// If the address is re-added before the endpoint is removed, its type
+	// changes back to Permanent.
+	PermanentExpired
+
+	// Temporary is an endpoint, created on a one-off basis to temporarily
+	// consider the NIC bound to an address that it is not explicitly bound to
+	// (such as a permanent address). Its reference count must not be biased by 1
+	// so that the address is removed immediately when references to it are no
+	// longer held.
+	//
+	// A temporary endpoint may be promoted to permanent if the address is added
+	// permanently.
+	Temporary
+)
+
+// IsPermanent returns true if the AddressKind represents a permanent address.
+func (k AddressKind) IsPermanent() bool {
+	switch k {
+	case Permanent, PermanentTentative:
+		return true
+	case Temporary, PermanentExpired:
+		return false
+	default:
+		panic(fmt.Sprintf("unrecognized address kind = %d", k))
+	}
+}
+
+// AddressableEndpoint is an endpoint that supports addressing.
+//
+// An endpoint is considered to support addressing when the endpoint may
+// associate itself with an identifier (address).
+type AddressableEndpoint interface {
+	// AddAndAcquirePermanentAddress adds the passed permanent address.
+	//
+	// Returns tcpip.ErrDuplicateAddress if the address exists.
+	//
+	// Acquires and returns the AddressEndpoint for the added address.
+	AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, *tcpip.Error)
+
+	// RemovePermanentAddress removes the passed address if it is a permanent
+	// address.
+	//
+	// Returns tcpip.ErrBadLocalAddress if the endpoint does not have the passed
+	// permanent address.
+	RemovePermanentAddress(addr tcpip.Address) *tcpip.Error
+
+	// MainAddress returns the endpoint's primary permanent address.
+	MainAddress() tcpip.AddressWithPrefix
+
+	// AcquireAssignedAddress returns an address endpoint for the passed address
+	// that is considered bound to the endpoint, optionally creating a temporary
+	// endpoint if requested and no existing address exists.
+	//
+	// The returned endpoint's reference count is incremented.
+	//
+	// Returns nil if the specified address is not local to this endpoint.
+	AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint
+
+	// AcquireOutgoingPrimaryAddress returns a primary address that may be used as
+	// a source address when sending packets to the passed remote address.
+	//
+	// If allowExpired is true, expired addresses may be returned.
+	//
+	// The returned endpoint's reference count is incremented.
+	//
+	// Returns nil if a primary address is not available.
+	AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint
+
+	// PrimaryAddresses returns the primary addresses.
+	PrimaryAddresses() []tcpip.AddressWithPrefix
+
+	// PermanentAddresses returns all the permanent addresses.
+	PermanentAddresses() []tcpip.AddressWithPrefix
+}
+
+// NDPEndpoint is a network endpoint that supports NDP.
+type NDPEndpoint interface {
+	NetworkEndpoint
+
+	// InvalidateDefaultRouter invalidates a default router discovered through
+	// NDP.
+	InvalidateDefaultRouter(tcpip.Address)
+}
+
+// NetworkInterface is a network interface.
+type NetworkInterface interface {
+	NetworkLinkEndpoint
+
+	// ID returns the interface's ID.
+	ID() tcpip.NICID
+
+	// IsLoopback returns true if the interface is a loopback interface.
+	IsLoopback() bool
+
+	// Name returns the name of the interface.
+	//
+	// May return an empty string if the interface is not configured with a name.
+	Name() string
+
+	// Enabled returns true if the interface is enabled.
+	Enabled() bool
+}
+
 // NetworkEndpoint is the interface that needs to be implemented by endpoints
 // of network layer protocols (e.g., ipv4, ipv6).
 type NetworkEndpoint interface {
+	AddressableEndpoint
+
+	// Enable enables the endpoint.
+	//
+	// Must only be called when the stack is in a state that allows the endpoint
+	// to send and receive packets.
+	//
+	// Returns tcpip.ErrNotPermitted if the endpoint cannot be enabled.
+	Enable() *tcpip.Error
+
+	// Enabled returns true if the endpoint is enabled.
+	Enabled() bool
+
+	// Disable disables the endpoint.
+	Disable()
+
 	// DefaultTTL is the default time-to-live value (or hop limit, in ipv6)
 	// for this endpoint.
 	DefaultTTL() uint8
@@ -238,10 +520,6 @@ type NetworkEndpoint interface {
 	// minus the network endpoint max header length.
 	MTU() uint32
 
-	// Capabilities returns the set of capabilities supported by the
-	// underlying link-layer endpoint.
-	Capabilities() LinkEndpointCapabilities
-
 	// MaxHeaderLength returns the maximum size the network (and lower
 	// level layers combined) headers can have. Higher levels use this
 	// information to reserve space in the front of the packets they're
@@ -262,9 +540,6 @@ type NetworkEndpoint interface {
 	// header to the given destination address. It takes ownership of pkt.
 	WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error
 
-	// NICID returns the id of the NIC this endpoint belongs to.
-	NICID() tcpip.NICID
-
 	// HandlePacket is called by the link layer when new packets arrive to
 	// this network endpoint. It sets pkt.NetworkHeader.
 	//
@@ -279,6 +554,17 @@ type NetworkEndpoint interface {
 	NetworkProtocolNumber() tcpip.NetworkProtocolNumber
 }
 
+// ForwardingNetworkProtocol is a NetworkProtocol that may forward packets.
+type ForwardingNetworkProtocol interface {
+	NetworkProtocol
+
+	// Forwarding returns the forwarding configuration.
+	Forwarding() bool
+
+	// SetForwarding sets the forwarding configuration.
+	SetForwarding(bool)
+}
+
 // NetworkProtocol is the interface that needs to be implemented by network
 // protocols (e.g., ipv4, ipv6) that want to be part of the networking stack.
 type NetworkProtocol interface {
@@ -298,17 +584,17 @@ type NetworkProtocol interface {
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
 	// NewEndpoint creates a new endpoint of this protocol.
-	NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) NetworkEndpoint
+	NewEndpoint(nic NetworkInterface, linkAddrCache LinkAddressCache, nud NUDHandler, dispatcher TransportDispatcher) NetworkEndpoint
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
 	// provided option value is invalid.
-	SetOption(option interface{}) *tcpip.Error
+	SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error
 
 	// Option allows retrieving protocol specific option values.
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
-	Option(option interface{}) *tcpip.Error
+	Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error
 
 	// Close requests that any worker goroutines owned by the protocol
 	// stop.
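The ForwardingNetworkProtocol interface above moves the per-stack forwarding flag into each network protocol. A minimal sketch of one way a protocol might satisfy it, using an atomic flag so Forwarding can be read on the hot path without a lock (the fakeProtocol name and layout are illustrative assumptions, not code from this change):

package main

import (
	"fmt"
	"sync/atomic"
)

// fakeProtocol sketches the state a protocol might keep to satisfy
// ForwardingNetworkProtocol. A real protocol would also implement the
// rest of NetworkProtocol.
type fakeProtocol struct {
	// forwarding is accessed atomically: 0 = disabled, 1 = enabled.
	forwarding uint32
}

// Forwarding returns the forwarding configuration.
func (p *fakeProtocol) Forwarding() bool {
	return atomic.LoadUint32(&p.forwarding) == 1
}

// SetForwarding sets the forwarding configuration.
func (p *fakeProtocol) SetForwarding(enable bool) {
	var v uint32
	if enable {
		v = 1
	}
	atomic.StoreUint32(&p.forwarding, v)
}

func main() {
	p := &fakeProtocol{}
	p.SetForwarding(true)
	fmt.Println(p.Forwarding()) // true
}

Stack.SetForwarding, shown later in this change, reaches such an implementation through a type assertion to ForwardingNetworkProtocol and returns tcpip.ErrNotSupported when the protocol does not implement the interface.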
@@ -376,22 +662,15 @@ const (
 	CapabilitySoftwareGSO
 )
 
-// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
-// ethernet, loopback, raw) and used by network layer protocols to send packets
-// out through the implementer's data link endpoint. When a link header exists,
-// it sets each PacketBuffer's LinkHeader field before passing it up the
-// stack.
-type LinkEndpoint interface {
+// NetworkLinkEndpoint is a data-link layer endpoint that supports sending
+// network layer packets.
+type NetworkLinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
 	// usually dictated by the backing physical network; when such a
 	// physical network doesn't exist, the limit is generally 64k, which
 	// includes the maximum size of an IP packet.
 	MTU() uint32
 
-	// Capabilities returns the set of capabilities supported by the
-	// endpoint.
-	Capabilities() LinkEndpointCapabilities
-
 	// MaxHeaderLength returns the maximum size the data link (and
 	// lower level layers combined) headers can have. Higher levels use this
 	// information to reserve space in the front of the packets they're
@@ -399,7 +678,7 @@ type LinkEndpoint interface {
 	MaxHeaderLength() uint16
 
 	// LinkAddress returns the link address (typically a MAC) of the
-	// link endpoint.
+	// endpoint.
 	LinkAddress() tcpip.LinkAddress
 
 	// WritePacket writes a packet with the given protocol through the
@@ -419,6 +698,19 @@ type LinkEndpoint interface {
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
 	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+}
+
+// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
+// ethernet, loopback, raw) and used by network layer protocols to send packets
+// out through the implementer's data link endpoint. When a link header exists,
+// it sets each PacketBuffer's LinkHeader field before passing it up the
+// stack.
+type LinkEndpoint interface {
+	NetworkLinkEndpoint
+
+	// Capabilities returns the set of capabilities supported by the
+	// endpoint.
+	Capabilities() LinkEndpointCapabilities
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header. It takes ownership of vv.
@@ -427,8 +719,8 @@ type LinkEndpoint interface {
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
 	//
-	// Attach will be called with a nil dispatcher if the receiver's associated
-	// NIC is being removed.
+	// Attach is called with a nil dispatcher when the endpoint's NIC is being
+	// removed.
 	Attach(dispatcher NetworkDispatcher)
 
 	// IsAttached returns whether a NetworkDispatcher is attached to the
@@ -488,7 +780,7 @@ type LinkAddressResolver interface {
 	ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool)
 
 	// LinkAddressProtocol returns the network protocol of the
-	// addresses this this resolver can resolve.
+	// addresses this resolver can resolve.
 	LinkAddressProtocol() tcpip.NetworkProtocolNumber
 }
 
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index e267bebb0..b76e2d37b 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -42,21 +42,27 @@ type Route struct {
 	// NetProto is the network-layer protocol.
 	NetProto tcpip.NetworkProtocolNumber
 
-	// ref a reference to the network endpoint through which the route
-	// starts.
-	ref *referencedNetworkEndpoint
-
 	// Loop controls where WritePacket should send packets.
 	Loop PacketLooping
 
-	// directedBroadcast indicates whether this route is sending a directed
-	// broadcast packet.
-	directedBroadcast bool
+	// nic is the NIC the route goes through.
+	nic *NIC
+
+	// addressEndpoint is the local address this route is associated with.
+	addressEndpoint AssignableAddressEndpoint
+
+	// linkCache is set if link address resolution is enabled for this protocol on
+	// the route's NIC.
+	linkCache LinkAddressCache
+
+	// linkRes is set if link address resolution is enabled for this protocol on
+	// the route's NIC.
+	linkRes LinkAddressResolver
 }
 
 // makeRoute initializes a new route. It takes ownership of the provided
-// reference to a network endpoint.
-func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, handleLocal, multicastLoop bool) Route {
+// AssignableAddressEndpoint.
+func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, nic *NIC, addressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) Route {
 	loop := PacketOut
 	if handleLocal && localAddr != "" && remoteAddr == localAddr {
 		loop = PacketLoop
@@ -66,29 +72,39 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip
 		loop |= PacketLoop
 	}
 
-	return Route{
+	r := Route{
 		NetProto:         netProto,
 		LocalAddress:     localAddr,
-		LocalLinkAddress: localLinkAddr,
+		LocalLinkAddress: nic.LinkEndpoint.LinkAddress(),
 		RemoteAddress:    remoteAddr,
-		ref:              ref,
+		addressEndpoint:  addressEndpoint,
+		nic:              nic,
 		Loop:             loop,
 	}
+
+	if r.nic.LinkEndpoint.Capabilities()&CapabilityResolutionRequired != 0 {
+		if linkRes, ok := r.nic.stack.linkAddrResolvers[r.NetProto]; ok {
+			r.linkRes = linkRes
+			r.linkCache = r.nic.stack
+		}
+	}
+
+	return r
 }
 
 // NICID returns the id of the NIC from which this route originates.
 func (r *Route) NICID() tcpip.NICID {
-	return r.ref.ep.NICID()
+	return r.nic.ID()
 }
 
 // MaxHeaderLength forwards the call to the network endpoint's implementation.
 func (r *Route) MaxHeaderLength() uint16 {
-	return r.ref.ep.MaxHeaderLength()
+	return r.nic.getNetworkEndpoint(r.NetProto).MaxHeaderLength()
 }
 
 // Stats returns a mutable copy of current stats.
 func (r *Route) Stats() tcpip.Stats {
-	return r.ref.nic.stack.Stats()
+	return r.nic.stack.Stats()
 }
 
 // PseudoHeaderChecksum forwards the call to the network endpoint's
@@ -99,12 +115,12 @@ func (r *Route) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, tot
 
 // Capabilities returns the link-layer capabilities of the route.
 func (r *Route) Capabilities() LinkEndpointCapabilities {
-	return r.ref.ep.Capabilities()
+	return r.nic.LinkEndpoint.Capabilities()
 }
 
 // GSOMaxSize returns the maximum GSO packet size.
 func (r *Route) GSOMaxSize() uint32 {
-	if gso, ok := r.ref.ep.(GSOEndpoint); ok {
+	if gso, ok := r.nic.LinkEndpoint.(GSOEndpoint); ok {
 		return gso.GSOMaxSize()
 	}
 	return 0
@@ -141,7 +157,17 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 		}
 		nextAddr = r.RemoteAddress
 	}
-	linkAddr, ch, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker)
+
+	if neigh := r.nic.neigh; neigh != nil {
+		entry, ch, err := neigh.entry(nextAddr, r.LocalAddress, r.linkRes, waker)
+		if err != nil {
+			return ch, err
+		}
+		r.RemoteLinkAddress = entry.LinkAddr
+		return nil, nil
+	}
+
+	linkAddr, ch, err := r.linkCache.GetLinkAddress(r.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker)
 	if err != nil {
 		return ch, err
 	}
@@ -155,7 +181,13 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 	if nextAddr == "" {
 		nextAddr = r.RemoteAddress
 	}
-	r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), nextAddr, waker)
+
+	if neigh := r.nic.neigh; neigh != nil {
+		neigh.removeWaker(nextAddr, waker)
+		return
+	}
+
+	r.linkCache.RemoveWaker(r.nic.ID(), nextAddr, waker)
 }
 
 // IsResolutionRequired returns true if Resolve() must be called to resolve
@@ -163,101 +195,63 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 //
 // The NIC r uses must not be locked.
 func (r *Route) IsResolutionRequired() bool {
-	return r.ref.isValidForOutgoing() && r.ref.linkCache != nil && r.RemoteLinkAddress == ""
+	if r.nic.neigh != nil {
+		return r.nic.isValidForOutgoing(r.addressEndpoint) && r.linkRes != nil && r.RemoteLinkAddress == ""
+	}
+	return r.nic.isValidForOutgoing(r.addressEndpoint) && r.linkCache != nil && r.RemoteLinkAddress == ""
 }
 
 // WritePacket writes the packet through the given route.
 func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
-	if !r.ref.isValidForOutgoing() {
+	if !r.nic.isValidForOutgoing(r.addressEndpoint) {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	// WritePacket takes ownership of pkt, calculate numBytes first.
-	numBytes := pkt.Size()
-
-	err := r.ref.ep.WritePacket(r, gso, params, pkt)
-	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.Increment()
-	} else {
-		r.ref.nic.stats.Tx.Packets.Increment()
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
-	}
-	return err
+	return r.nic.getNetworkEndpoint(r.NetProto).WritePacket(r, gso, params, pkt)
 }
 
 // WritePackets writes a list of n packets through the given route and returns
 // the number of packets written.
 func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
-	if !r.ref.isValidForOutgoing() {
+	if !r.nic.isValidForOutgoing(r.addressEndpoint) {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
-	// WritePackets takes ownership of pkt, calculate length first.
-	numPkts := pkts.Len()
-
-	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
-	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(numPkts - n))
-	}
-	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
-
-	writtenBytes := 0
-	for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() {
-		writtenBytes += pb.Size()
-	}
-
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes))
-	return n, err
+	return r.nic.getNetworkEndpoint(r.NetProto).WritePackets(r, gso, pkts, params)
 }
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
 func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) *tcpip.Error {
-	if !r.ref.isValidForOutgoing() {
+	if !r.nic.isValidForOutgoing(r.addressEndpoint) {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	// WriteHeaderIncludedPacket takes ownership of pkt, calculate numBytes first.
-	numBytes := pkt.Data.Size()
-
-	if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
-		r.Stats().IP.OutgoingPacketErrors.Increment()
-		return err
-	}
-	r.ref.nic.stats.Tx.Packets.Increment()
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
-	return nil
+	return r.nic.getNetworkEndpoint(r.NetProto).WriteHeaderIncludedPacket(r, pkt)
 }
 
 // DefaultTTL returns the default TTL of the underlying network endpoint.
 func (r *Route) DefaultTTL() uint8 {
-	return r.ref.ep.DefaultTTL()
+	return r.nic.getNetworkEndpoint(r.NetProto).DefaultTTL()
 }
 
 // MTU returns the MTU of the underlying network endpoint.
 func (r *Route) MTU() uint32 {
-	return r.ref.ep.MTU()
-}
-
-// NetworkProtocolNumber returns the NetworkProtocolNumber of the underlying
-// network endpoint.
-func (r *Route) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
-	return r.ref.ep.NetworkProtocolNumber()
+	return r.nic.getNetworkEndpoint(r.NetProto).MTU()
 }
 
 // Release frees all resources associated with the route.
 func (r *Route) Release() {
-	if r.ref != nil {
-		r.ref.decRef()
-		r.ref = nil
+	if r.addressEndpoint != nil {
+		r.addressEndpoint.DecRef()
+		r.addressEndpoint = nil
 	}
 }
 
-// Clone Clone a route such that the original one can be released and the new
-// one will remain valid.
+// Clone clones the route.
 func (r *Route) Clone() Route {
-	if r.ref != nil {
-		r.ref.incRef()
+	if r.addressEndpoint != nil {
+		_ = r.addressEndpoint.IncRef()
 	}
 	return *r
 }
@@ -281,27 +275,30 @@ func (r *Route) MakeLoopedRoute() Route {
 
 // Stack returns the instance of the Stack that owns this route.
 func (r *Route) Stack() *Stack {
-	return r.ref.stack()
+	return r.nic.stack
+}
+
+func (r *Route) isV4Broadcast(addr tcpip.Address) bool {
+	if addr == header.IPv4Broadcast {
+		return true
+	}
+
+	subnet := r.addressEndpoint.AddressWithPrefix().Subnet()
+	return subnet.IsBroadcast(addr)
 }
 
 // IsOutboundBroadcast returns true if the route is for an outbound broadcast
 // packet.
 func (r *Route) IsOutboundBroadcast() bool {
 	// Only IPv4 has a notion of broadcast.
-	return r.directedBroadcast || r.RemoteAddress == header.IPv4Broadcast
+	return r.isV4Broadcast(r.RemoteAddress)
 }
 
 // IsInboundBroadcast returns true if the route is for an inbound broadcast
 // packet.
 func (r *Route) IsInboundBroadcast() bool {
 	// Only IPv4 has a notion of broadcast.
-	if r.LocalAddress == header.IPv4Broadcast {
-		return true
-	}
-
-	addr := r.ref.addrWithPrefix()
-	subnet := addr.Subnet()
-	return subnet.IsBroadcast(r.LocalAddress)
+	return r.isV4Broadcast(r.LocalAddress)
 }
 
 // ReverseRoute returns new route with given source and destination address.
@@ -312,7 +309,10 @@ func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route {
 		LocalLinkAddress:  r.RemoteLinkAddress,
 		RemoteAddress:     src,
 		RemoteLinkAddress: r.LocalLinkAddress,
-		ref:               r.ref,
 		Loop:              r.Loop,
+		addressEndpoint:   r.addressEndpoint,
+		nic:               r.nic,
+		linkCache:         r.linkCache,
+		linkRes:           r.linkRes,
 	}
 }
 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a3f87c8af..3a07577c8 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -144,10 +144,7 @@ type TCPReceiverState struct {
 	// PendingBufUsed is the number of bytes pending in the receive
 	// queue.
-	PendingBufUsed seqnum.Size
-
-	// PendingBufSize is the size of the socket receive buffer.
-	PendingBufSize seqnum.Size
+	PendingBufUsed int
 }
 
 // TCPSenderState holds a copy of the internal state of the sender for
@@ -248,7 +245,7 @@ type RcvBufAutoTuneParams struct {
 	// was started.
 	MeasureTime time.Time
 
-	// CopiedBytes is the number of bytes copied to userspace since
+	// CopiedBytes is the number of bytes copied to user space since
 	// this measure began.
 	CopiedBytes int
 
@@ -366,38 +363,6 @@ func (u *uniqueIDGenerator) UniqueID() uint64 {
 	return atomic.AddUint64((*uint64)(u), 1)
 }
 
-// NICNameFromID is a function that returns a stable name for the specified NIC,
-// even if different NIC IDs are used to refer to the same NIC in different
-// program runs. It is used when generating opaque interface identifiers (IIDs).
-// If the NIC was created with a name, it will be passed to NICNameFromID.
-//
-// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
-// generated for the same prefix on differnt NICs.
-type NICNameFromID func(tcpip.NICID, string) string
-
-// OpaqueInterfaceIdentifierOptions holds the options related to the generation
-// of opaque interface indentifiers (IIDs) as defined by RFC 7217.
-type OpaqueInterfaceIdentifierOptions struct {
-	// NICNameFromID is a function that returns a stable name for a specified NIC,
-	// even if the NIC ID changes over time.
-	//
-	// Must be specified to generate the opaque IID.
-	NICNameFromID NICNameFromID
-
-	// SecretKey is a pseudo-random number used as the secret key when generating
-	// opaque IIDs as defined by RFC 7217. The key SHOULD be at least
-	// header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
-	// requirements for security as outlined by RFC 4086. SecretKey MUST NOT
-	// change between program runs, unless explicitly changed.
-	//
-	// OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
-	// MUST NOT be modified after Stack is created.
-	//
-	// May be nil, but a nil value is highly discouraged to maintain
-	// some level of randomness between nodes.
-	SecretKey []byte
-}
-
 // Stack is a networking stack, with all supported protocols, NICs, and route
 // table.
 type Stack struct {
@@ -415,10 +380,12 @@ type Stack struct {
 
 	linkAddrCache *linkAddrCache
 
-	mu               sync.RWMutex
-	nics             map[tcpip.NICID]*NIC
-	forwarding       bool
-	cleanupEndpoints map[TransportEndpoint]struct{}
+	mu   sync.RWMutex
+	nics map[tcpip.NICID]*NIC
+
+	// cleanupEndpointsMu protects cleanupEndpoints.
+	cleanupEndpointsMu sync.Mutex
+	cleanupEndpoints   map[TransportEndpoint]struct{}
 
 	// route is the route table passed in by the user via SetRouteTable(),
 	// it is used by FindRoute() to build a route for a specific
@@ -429,7 +396,7 @@ type Stack struct {
 
 	// If not nil, then any new endpoints will have this probe function
 	// invoked everytime they receive a TCP segment.
-	tcpProbeFunc TCPProbeFunc
+	tcpProbeFunc atomic.Value // TCPProbeFunc
 
 	// clock is used to generate user-visible times.
 	clock tcpip.Clock
@@ -455,20 +422,12 @@ type Stack struct {
 	// TODO(gvisor.dev/issue/940): S/R this field.
 	seed uint32
 
-	// ndpConfigs is the default NDP configurations used by interfaces.
-	ndpConfigs NDPConfigurations
-
 	// nudConfigs is the default NUD configurations used by interfaces.
 	nudConfigs NUDConfigurations
 
-	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
-	// to auto-generate an IPv6 link-local address for newly enabled non-loopback
-	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
-	autoGenIPv6LinkLocal bool
-
-	// ndpDisp is the NDP event dispatcher that is used to send the netstack
-	// integrator NDP related events.
-	ndpDisp NDPDispatcher
+	// useNeighborCache indicates whether ARP and NDP packets should be handled
+	// by the NIC's neighborCache instead of linkAddrCache.
+	useNeighborCache bool
 
 	// nudDisp is the NUD event dispatcher that is used to send the netstack
 	// integrator NUD related events.
@@ -477,17 +436,9 @@ type Stack struct {
 	// uniqueIDGenerator is a generator of unique identifiers.
 	uniqueIDGenerator UniqueID
 
-	// opaqueIIDOpts hold the options for generating opaque interface identifiers
-	// (IIDs) as outlined by RFC 7217.
-	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
-
-	// tempIIDSeed is used to seed the initial temporary interface identifier
-	// history value used to generate IIDs for temporary SLAAC addresses.
-	tempIIDSeed []byte
-
-	// forwarder holds the packets that wait for their link-address resolutions
-	// to complete, and forwards them when each resolution is done.
-	forwarder *forwardQueue
+	// linkResQueue holds packets that are waiting for link resolution to
+	// complete.
+	linkResQueue packetsPendingLinkResolution
 
 	// randomGenerator is an injectable pseudo random generator that can be
 	// used when a random number is required.
@@ -507,13 +458,25 @@ type UniqueID interface {
 	UniqueID() uint64
 }
 
+// NetworkProtocolFactory instantiates a network protocol.
+//
+// NetworkProtocolFactory must not attempt to modify the stack; it may only
+// query the stack.
+type NetworkProtocolFactory func(*Stack) NetworkProtocol
+
+// TransportProtocolFactory instantiates a transport protocol.
+//
+// TransportProtocolFactory must not attempt to modify the stack; it may only
+// query the stack.
+type TransportProtocolFactory func(*Stack) TransportProtocol
+
 // Options contains optional Stack configuration.
 type Options struct {
 	// NetworkProtocols lists the network protocols to enable.
-	NetworkProtocols []NetworkProtocol
+	NetworkProtocols []NetworkProtocolFactory
 
 	// TransportProtocols lists the transport protocols to enable.
-	TransportProtocols []TransportProtocol
+	TransportProtocols []TransportProtocolFactory
 
 	// Clock is an optional clock source used for timestampping packets.
 	//
@@ -531,33 +494,15 @@ type Options struct {
 	// UniqueID is an optional generator of unique identifiers.
 	UniqueID UniqueID
 
-	// NDPConfigs is the default NDP configurations used by interfaces.
-	//
-	// By default, NDPConfigs will have a zero value for its
-	// DupAddrDetectTransmits field, implying that DAD will not be performed
-	// before assigning an address to a NIC.
-	NDPConfigs NDPConfigurations
-
 	// NUDConfigs is the default NUD configurations used by interfaces.
 	NUDConfigs NUDConfigurations
 
-	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt to
-	// auto-generate an IPv6 link-local address for newly enabled non-loopback
-	// NICs.
-	//
-	// Note, setting this to true does not mean that a link-local address
-	// will be assigned right away, or at all. If Duplicate Address Detection
-	// is enabled, an address will only be assigned if it successfully resolves.
-	// If it fails, no further attempt will be made to auto-generate an IPv6
-	// link-local address.
-	//
-	// The generated link-local address will follow RFC 4291 Appendix A
-	// guidelines.
-	AutoGenIPv6LinkLocal bool
-
-	// NDPDisp is the NDP event dispatcher that an integrator can provide to
-	// receive NDP related events.
-	NDPDisp NDPDispatcher
+	// UseNeighborCache indicates whether ARP and NDP packets should be handled
+	// by the Neighbor Unreachability Detection (NUD) state machine. This flag
+	// also enables the APIs for inspecting and modifying the neighbor table via
+	// NUDDispatcher and the following Stack methods: Neighbors, RemoveNeighbor,
+	// and ClearNeighbors.
+	UseNeighborCache bool
 
 	// NUDDisp is the NUD event dispatcher that an integrator can provide to
 	// receive NUD related events.
@@ -567,31 +512,12 @@ type Options struct {
 	// this is non-nil.
 	RawFactory RawFactory
 
-	// OpaqueIIDOpts hold the options for generating opaque interface
-	// identifiers (IIDs) as outlined by RFC 7217.
-	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
-
 	// RandSource is an optional source to use to generate random
 	// numbers. If omitted it defaults to a Source seeded by the data
 	// returned by rand.Read().
 	//
 	// RandSource must be thread-safe.
 	RandSource mathrand.Source
-
-	// TempIIDSeed is used to seed the initial temporary interface identifier
-	// history value used to generate IIDs for temporary SLAAC addresses.
-	//
-	// Temporary SLAAC adresses are short-lived addresses which are unpredictable
-	// and random from the perspective of other nodes on the network. It is
-	// recommended that the seed be a random byte buffer of at least
-	// header.IIDSize bytes to make sure that temporary SLAAC addresses are
-	// sufficiently random. It should follow minimum randomness requirements for
-	// security as outlined by RFC 4086.
-	//
-	// Note: using a nil value, the same seed across netstack program runs, or a
-	// seed that is too small would reduce randomness and increase predictability,
-	// defeating the purpose of temporary SLAAC addresses.
-	TempIIDSeed []byte
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -624,8 +550,8 @@ type TransportEndpointInfo struct {
 // incompatible with the receiver.
 //
 // Preconditon: the parent endpoint mu must be held while calling this method.
-func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
+func (t *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	netProto := t.NetProto
 	switch len(addr.Addr) {
 	case header.IPv4AddressSize:
 		netProto = header.IPv4ProtocolNumber
@@ -639,7 +565,7 @@ func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6onl
 		}
 	}
 
-	switch len(e.ID.LocalAddress) {
+	switch len(t.ID.LocalAddress) {
 	case header.IPv4AddressSize:
 		if len(addr.Addr) == header.IPv6AddressSize {
 			return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
@@ -651,8 +577,8 @@ func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6onl
 	}
 
 	switch {
-	case netProto == e.NetProto:
-	case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber:
+	case netProto == t.NetProto:
+	case netProto == header.IPv4ProtocolNumber && t.NetProto == header.IPv6ProtocolNumber:
 		if v6only {
 			return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
 		}
@@ -694,35 +620,27 @@ func New(opts Options) *Stack {
 		randSrc = &lockedRandomSource{src: mathrand.NewSource(generateRandInt64())}
 	}
 
-	// Make sure opts.NDPConfigs contains valid values only.
-	opts.NDPConfigs.validate()
-
 	opts.NUDConfigs.resetInvalidFields()
 
 	s := &Stack{
-		transportProtocols:   make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
-		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
-		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
-		nics:                 make(map[tcpip.NICID]*NIC),
-		cleanupEndpoints:     make(map[TransportEndpoint]struct{}),
-		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
-		PortManager:          ports.NewPortManager(),
-		clock:                clock,
-		stats:                opts.Stats.FillIn(),
-		handleLocal:          opts.HandleLocal,
-		tables:               DefaultTables(),
-		icmpRateLimiter:      NewICMPRateLimiter(),
-		seed:                 generateRandUint32(),
-		ndpConfigs:           opts.NDPConfigs,
-		nudConfigs:           opts.NUDConfigs,
-		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
-		uniqueIDGenerator:    opts.UniqueID,
-		ndpDisp:              opts.NDPDisp,
-		nudDisp:              opts.NUDDisp,
-		opaqueIIDOpts:        opts.OpaqueIIDOpts,
-		tempIIDSeed:          opts.TempIIDSeed,
-		forwarder:            newForwardQueue(),
-		randomGenerator:      mathrand.New(randSrc),
+		transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
+		networkProtocols:   make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
+		linkAddrResolvers:  make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
+		nics:               make(map[tcpip.NICID]*NIC),
+		cleanupEndpoints:   make(map[TransportEndpoint]struct{}),
+		linkAddrCache:      newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
+		PortManager:        ports.NewPortManager(),
+		clock:              clock,
+		stats:              opts.Stats.FillIn(),
+		handleLocal:        opts.HandleLocal,
+		tables:             DefaultTables(),
+		icmpRateLimiter:    NewICMPRateLimiter(),
+		seed:               generateRandUint32(),
+		nudConfigs:         opts.NUDConfigs,
+		useNeighborCache:   opts.UseNeighborCache,
+		uniqueIDGenerator:  opts.UniqueID,
+		nudDisp:            opts.NUDDisp,
+		randomGenerator:    mathrand.New(randSrc),
 		sendBufferSize: SendBufferSizeOption{
 			Min:     MinBufferSize,
 			Default: DefaultBufferSize,
@@ -734,9 +652,11 @@ func New(opts Options) *Stack {
 			Max:     DefaultMaxBufferSize,
 		},
 	}
+	s.linkResQueue.init()
 
 	// Add specified network protocols.
-	for _, netProto := range opts.NetworkProtocols {
+	for _, netProtoFactory := range opts.NetworkProtocols {
+		netProto := netProtoFactory(s)
 		s.networkProtocols[netProto.Number()] = netProto
 		if r, ok := netProto.(LinkAddressResolver); ok {
 			s.linkAddrResolvers[r.LinkAddressProtocol()] = r
@@ -744,7 +664,8 @@ func New(opts Options) *Stack {
 	}
 
 	// Add specified transport protocols.
-	for _, transProto := range opts.TransportProtocols {
+	for _, transProtoFactory := range opts.TransportProtocols {
+		transProto := transProtoFactory(s)
 		s.transportProtocols[transProto.Number()] = &transportProtocolState{
 			proto: transProto,
 		}
@@ -773,7 +694,7 @@ func (s *Stack) UniqueID() uint64 {
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
 // is incorrect.
-func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	netProto, ok := s.networkProtocols[network]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -790,7 +711,7 @@ func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, op
 // if err != nil {
 //   ...
 // }
-func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	netProto, ok := s.networkProtocols[network]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -802,7 +723,7 @@ func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, optio
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
 // is incorrect.
-func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	transProtoState, ok := s.transportProtocols[transport]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -817,7 +738,7 @@ func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumb
 // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
 //   ...
 // }
-func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	transProtoState, ok := s.transportProtocols[transport]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -851,46 +772,37 @@ func (s *Stack) Stats() tcpip.Stats {
 	return s.stats
 }
 
-// SetForwarding enables or disables the packet forwarding between NICs.
-//
-// When forwarding becomes enabled, any host-only state on all NICs will be
-// cleaned up and if IPv6 is enabled, NDP Router Solicitations will be started.
-// When forwarding becomes disabled and if IPv6 is enabled, NDP Router
-// Solicitations will be stopped.
-func (s *Stack) SetForwarding(enable bool) {
-	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
-	s.mu.Lock()
-	defer s.mu.Unlock()
+// SetForwarding enables or disables packet forwarding between NICs for the
+// passed protocol.
+func (s *Stack) SetForwarding(protocolNum tcpip.NetworkProtocolNumber, enable bool) *tcpip.Error {
+	protocol, ok := s.networkProtocols[protocolNum]
+	if !ok {
+		return tcpip.ErrUnknownProtocol
+	}
 
-	// If forwarding status didn't change, do nothing further.
-	if s.forwarding == enable {
-		return
+	forwardingProtocol, ok := protocol.(ForwardingNetworkProtocol)
+	if !ok {
+		return tcpip.ErrNotSupported
 	}
-	s.forwarding = enable
+	forwardingProtocol.SetForwarding(enable)
+	return nil
+}
 
-	// If this stack does not support IPv6, do nothing further.
-	if _, ok := s.networkProtocols[header.IPv6ProtocolNumber]; !ok {
-		return
+// Forwarding returns true if packet forwarding between NICs is enabled for the
+// passed protocol.
+func (s *Stack) Forwarding(protocolNum tcpip.NetworkProtocolNumber) bool {
+	protocol, ok := s.networkProtocols[protocolNum]
+	if !ok {
+		return false
 	}
 
-	if enable {
-		for _, nic := range s.nics {
-			nic.becomeIPv6Router()
-		}
-	} else {
-		for _, nic := range s.nics {
-			nic.becomeIPv6Host()
-		}
+	forwardingProtocol, ok := protocol.(ForwardingNetworkProtocol)
+	if !ok {
+		return false
 	}
-}
 
-// Forwarding returns if the packet forwarding between NICs is enabled.
-func (s *Stack) Forwarding() bool {
-	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	return s.forwarding
+	return forwardingProtocol.Forwarding()
 }
 
 // SetRouteTable assigns the route table to be used by this stack. It
@@ -925,7 +837,7 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
-	return t.proto.NewEndpoint(s, network, waiterQueue)
+	return t.proto.NewEndpoint(network, waiterQueue)
 }
 
 // NewRawEndpoint creates a new raw transport layer endpoint of the given
@@ -945,7 +857,7 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
-	return t.proto.NewRawEndpoint(s, network, waiterQueue)
+	return t.proto.NewRawEndpoint(network, waiterQueue)
 }
 
 // NewPacketEndpoint creates a new packet endpoint listening for the given
@@ -1016,16 +928,16 @@ func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
-// GetNICByName gets the NIC specified by name.
-func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+// GetLinkEndpointByName gets the link endpoint specified by name.
+func (s *Stack) GetLinkEndpointByName(name string) LinkEndpoint {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	for _, nic := range s.nics {
 		if nic.Name() == name {
-			return nic, true
+			return nic.LinkEndpoint
 		}
 	}
-	return nil, false
+	return nil
 }
 
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
@@ -1052,7 +964,8 @@ func (s *Stack) DisableNIC(id tcpip.NICID) *tcpip.Error {
 		return tcpip.ErrUnknownNICID
 	}
 
-	return nic.disable()
+	nic.disable()
+	return nil
 }
 
 // CheckNIC checks if a NIC is usable.
@@ -1065,7 +978,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 		return false
 	}
 
-	return nic.enabled()
+	return nic.Enabled()
 }
 
 // RemoveNIC removes NIC and all related routes from the network stack.
@@ -1143,19 +1056,19 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 	for id, nic := range s.nics {
 		flags := NICStateFlags{
 			Up:          true, // Netstack interfaces are always up.
-			Running:     nic.enabled(),
+			Running:     nic.Enabled(),
 			Promiscuous: nic.isPromiscuousMode(),
-			Loopback:    nic.isLoopback(),
+			Loopback:    nic.IsLoopback(),
 		}
 		nics[id] = NICInfo{
 			Name:              nic.name,
-			LinkAddress:       nic.linkEP.LinkAddress(),
-			ProtocolAddresses: nic.PrimaryAddresses(),
+			LinkAddress:       nic.LinkEndpoint.LinkAddress(),
+			ProtocolAddresses: nic.primaryAddresses(),
 			Flags:             flags,
-			MTU:               nic.linkEP.MTU(),
+			MTU:               nic.LinkEndpoint.MTU(),
 			Stats:             nic.stats,
 			Context:           nic.context,
-			ARPHardwareType:   nic.linkEP.ARPHardwareType(),
+			ARPHardwareType:   nic.LinkEndpoint.ARPHardwareType(),
 		}
 	}
 	return nics
@@ -1209,12 +1122,12 @@ func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tc
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[id]
-	if nic == nil {
+	nic, ok := s.nics[id]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
-	return nic.AddAddress(protocolAddress, peb)
+	return nic.addAddress(protocolAddress, peb)
 }
 
 // RemoveAddress removes an existing network-layer address from the specified
@@ -1224,7 +1137,7 @@ func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
 	defer s.mu.RUnlock()
 
 	if nic, ok := s.nics[id]; ok {
-		return nic.RemoveAddress(addr)
+		return nic.removeAddress(addr)
 	}
 
 	return tcpip.ErrUnknownNICID
@@ -1238,7 +1151,7 @@ func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress {
 
 	nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress)
 	for id, nic := range s.nics {
-		nics[id] = nic.AllAddresses()
+		nics[id] = nic.allPermanentAddresses()
 	}
 	return nics
 }
@@ -1260,7 +1173,7 @@ func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocol
 	return nic.primaryAddress(protocol), nil
 }
 
-func (s *Stack) getRefEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
+func (s *Stack) getAddressEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) AssignableAddressEndpoint {
 	if len(localAddr) == 0 {
 		return nic.primaryEndpoint(netProto, remoteAddr)
 	}
@@ -1277,9 +1190,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
 	needRoute := !(isLocalBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
 	if id != 0 && !needRoute {
-		if nic, ok := s.nics[id]; ok && nic.enabled() {
-			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
-				return makeRoute(netProto, ref.address(), remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
+		if nic, ok := s.nics[id]; ok && nic.Enabled() {
+			if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
+				return makeRoute(netProto, addressEndpoint.AddressWithPrefix().Address, remoteAddr, nic, addressEndpoint, s.handleLocal && !nic.IsLoopback(), multicastLoop && !nic.IsLoopback()), nil
 			}
 		}
 	} else {
@@ -1287,22 +1200,20 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
 				continue
 			}
-			if nic, ok := s.nics[route.NIC]; ok && nic.enabled() {
-				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
+			if nic, ok := s.nics[route.NIC]; ok && nic.Enabled() {
+				if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
 						// provided will refer to the link local address.
-						remoteAddr = ref.address()
+						remoteAddr = addressEndpoint.AddressWithPrefix().Address
 					}
 
-					r := makeRoute(netProto, ref.address(), remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
-					r.directedBroadcast = route.Destination.IsBroadcast(remoteAddr)
-
+					r := makeRoute(netProto, addressEndpoint.AddressWithPrefix().Address, remoteAddr, nic, addressEndpoint, s.handleLocal && !nic.IsLoopback(), multicastLoop && !nic.IsLoopback())
 					if len(route.Gateway) > 0 {
 						if needRoute {
 							r.NextHop = route.Gateway
 						}
-					} else if r.directedBroadcast {
+					} else if subnet := addressEndpoint.AddressWithPrefix().Subnet(); subnet.IsBroadcast(remoteAddr) {
 						r.RemoteLinkAddress = header.EthernetBroadcastAddress
 					}
 
@@ -1335,26 +1246,25 @@ func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProto
 	// If a NIC is specified, we try to find the address there only.
 	if nicID != 0 {
-		nic := s.nics[nicID]
-		if nic == nil {
+		nic, ok := s.nics[nicID]
+		if !ok {
 			return 0
 		}
 
-		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
-		if ref == nil {
+		addressEndpoint := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
+		if addressEndpoint == nil {
 			return 0
 		}
 
-		ref.decRef()
+		addressEndpoint.DecRef()
 
 		return nic.id
 	}
 
 	// Go through all the NICs.
 	for _, nic := range s.nics {
-		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
-		if ref != nil {
-			ref.decRef()
+		if addressEndpoint := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint); addressEndpoint != nil {
+			addressEndpoint.DecRef()
 			return nic.id
 		}
 	}
@@ -1367,8 +1277,8 @@ func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[nicID]
-	if nic == nil {
+	nic, ok := s.nics[nicID]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1383,8 +1293,8 @@ func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[nicID]
-	if nic == nil {
+	nic, ok := s.nics[nicID]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1413,11 +1323,36 @@ func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address,
 	fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
 	linkRes := s.linkAddrResolvers[protocol]
-	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker)
+	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.LinkEndpoint, waker)
 }
 
-// RemoveWaker implements LinkAddressCache.RemoveWaker.
+// Neighbors returns all IP to MAC address associations.
+func (s *Stack) Neighbors(nicID tcpip.NICID) ([]NeighborEntry, *tcpip.Error) {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return nil, tcpip.ErrUnknownNICID
+	}
+
+	return nic.neighbors()
+}
+
+// RemoveWaker removes a waker that has been added when link resolution for
+// addr was requested.
 func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
+	if s.useNeighborCache {
+		s.mu.RLock()
+		nic, ok := s.nics[nicID]
+		s.mu.RUnlock()
+
+		if ok {
+			nic.removeWaker(addr, waker)
+		}
+		return
+	}
+
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
@@ -1427,6 +1362,47 @@ func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.
 	}
 }
 
+// AddStaticNeighbor statically associates an IP address with a MAC address.
+func (s *Stack) AddStaticNeighbor(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) *tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[nicID] + s.mu.RUnlock() + + if !ok { + return tcpip.ErrUnknownNICID + } + + return nic.addStaticNeighbor(addr, linkAddr) +} + +// RemoveNeighbor removes an IP to MAC address association previously created +// either automatically or by AddStaticNeighbor. Returns ErrBadAddress if there +// is no association with the provided address. +func (s *Stack) RemoveNeighbor(nicID tcpip.NICID, addr tcpip.Address) *tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[nicID] + s.mu.RUnlock() + + if !ok { + return tcpip.ErrUnknownNICID + } + + return nic.removeNeighbor(addr) +} + +// ClearNeighbors removes all IP to MAC address associations. +func (s *Stack) ClearNeighbors(nicID tcpip.NICID) *tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[nicID] + s.mu.RUnlock() + + if !ok { + return tcpip.ErrUnknownNICID + } + + return nic.clearNeighbors() +} + // RegisterTransportEndpoint registers the given endpoint with the stack // transport dispatcher. Received packets that match the provided id will be // delivered to the given endpoint; specifying a nic is optional, but @@ -1450,10 +1426,9 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip // StartTransportEndpointCleanup removes the endpoint with the given id from // the stack transport dispatcher. It also transitions it to the cleanup stage. func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { - s.mu.Lock() - defer s.mu.Unlock() - + s.cleanupEndpointsMu.Lock() s.cleanupEndpoints[ep] = struct{}{} + s.cleanupEndpointsMu.Unlock() s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } @@ -1461,9 +1436,9 @@ func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcp // CompleteTransportEndpointCleanup removes the endpoint from the cleanup // stage. func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) { - s.mu.Lock() + s.cleanupEndpointsMu.Lock() delete(s.cleanupEndpoints, ep) - s.mu.Unlock() + s.cleanupEndpointsMu.Unlock() } // FindTransportEndpoint finds an endpoint that most closely matches the provided @@ -1506,23 +1481,23 @@ func (s *Stack) RegisteredEndpoints() []TransportEndpoint { // CleanupEndpoints returns endpoints currently in the cleanup state. func (s *Stack) CleanupEndpoints() []TransportEndpoint { - s.mu.Lock() + s.cleanupEndpointsMu.Lock() es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints)) for e := range s.cleanupEndpoints { es = append(es, e) } - s.mu.Unlock() + s.cleanupEndpointsMu.Unlock() return es } // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful // for restoring a stack after a save. func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) { - s.mu.Lock() + s.cleanupEndpointsMu.Lock() for _, e := range es { s.cleanupEndpoints[e] = struct{}{} } - s.mu.Unlock() + s.cleanupEndpointsMu.Unlock() } // Close closes all currently registered transport endpoints. @@ -1564,7 +1539,7 @@ func (s *Stack) Wait() { s.mu.RLock() defer s.mu.RUnlock() for _, n := range s.nics { - n.linkEP.Wait() + n.LinkEndpoint.Wait() } } @@ -1652,7 +1627,7 @@ func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto t // Add our own fake ethernet header.
ethFields := header.EthernetFields{ - SrcAddr: nic.linkEP.LinkAddress(), + SrcAddr: nic.LinkEndpoint.LinkAddress(), DstAddr: dst, Type: netProto, } @@ -1661,7 +1636,7 @@ func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto t vv := buffer.View(fakeHeader).ToVectorisedView() vv.Append(payload) - if err := nic.linkEP.WriteRawPacket(vv); err != nil { + if err := nic.LinkEndpoint.WriteRawPacket(vv); err != nil { return err } @@ -1678,7 +1653,7 @@ func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) return tcpip.ErrUnknownDevice } - if err := nic.linkEP.WriteRawPacket(payload); err != nil { + if err := nic.LinkEndpoint.WriteRawPacket(payload); err != nil { return err } @@ -1717,18 +1692,17 @@ func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) Tra // guarantee provided on which probe will be invoked. Ideally this should only // be called once per stack. func (s *Stack) AddTCPProbe(probe TCPProbeFunc) { - s.mu.Lock() - s.tcpProbeFunc = probe - s.mu.Unlock() + s.tcpProbeFunc.Store(probe) } // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil // otherwise. func (s *Stack) GetTCPProbe() TCPProbeFunc { - s.mu.Lock() - p := s.tcpProbeFunc - s.mu.Unlock() - return p + p := s.tcpProbeFunc.Load() + if p == nil { + return nil + } + return p.(TCPProbeFunc) } // RemoveTCPProbe removes an installed TCP probe. @@ -1737,9 +1711,8 @@ func (s *Stack) GetTCPProbe() TCPProbeFunc { // have a probe attached. Endpoints already created will continue to invoke // TCP probe. func (s *Stack) RemoveTCPProbe() { - s.mu.Lock() - s.tcpProbeFunc = nil - s.mu.Unlock() + // This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics. + s.tcpProbeFunc.Store(TCPProbeFunc(nil)) } // JoinGroup joins the given multicast group on the given NIC. @@ -1760,7 +1733,7 @@ func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NIC defer s.mu.RUnlock() if nic, ok := s.nics[nicID]; ok { - return nic.leaveGroup(multicastAddr) + return nic.leaveGroup(protocol, multicastAddr) } return tcpip.ErrUnknownNICID } @@ -1812,53 +1785,18 @@ func (s *Stack) AllowICMPMessage() bool { return s.icmpRateLimiter.Allow() } -// IsAddrTentative returns true if addr is tentative on the NIC with ID id. -// -// Note that if addr is not associated with a NIC with id ID, then this -// function will return false. It will only return true if the address is -// associated with the NIC AND it is tentative. -func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) { - s.mu.RLock() - defer s.mu.RUnlock() - - nic, ok := s.nics[id] - if !ok { - return false, tcpip.ErrUnknownNICID - } - - return nic.isAddrTentative(addr), nil -} - -// DupTentativeAddrDetected attempts to inform the NIC with ID id that a -// tentative addr on it is a duplicate on a link. -func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error { - s.mu.Lock() - defer s.mu.Unlock() - - nic, ok := s.nics[id] - if !ok { - return tcpip.ErrUnknownNICID - } - - return nic.dupTentativeAddrDetected(addr) -} - -// SetNDPConfigurations sets the per-interface NDP configurations on the NIC -// with ID id to c. -// -// Note, if c contains invalid NDP configuration values, it will be fixed to -// use default values for the erroneous values. 
-func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip.Error { +// GetNetworkEndpoint returns the NetworkEndpoint with the specified protocol +// number installed on the specified NIC. +func (s *Stack) GetNetworkEndpoint(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber) (NetworkEndpoint, *tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() - nic, ok := s.nics[id] + nic, ok := s.nics[nicID] if !ok { - return tcpip.ErrUnknownNICID + return nil, tcpip.ErrUnknownNICID } - nic.setNDPConfigs(c) - return nil + return nic.getNetworkEndpoint(proto), nil } // NUDConfigurations gets the per-interface NUD configurations. @@ -1871,7 +1809,7 @@ func (s *Stack) NUDConfigurations(id tcpip.NICID) (NUDConfigurations, *tcpip.Err return NUDConfigurations{}, tcpip.ErrUnknownNICID } - return nic.NUDConfigs() + return nic.nudConfigs() } // SetNUDConfigurations sets the per-interface NUD configurations. @@ -1890,22 +1828,6 @@ func (s *Stack) SetNUDConfigurations(id tcpip.NICID, c NUDConfigurations) *tcpip return nic.setNUDConfigs(c) } -// HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement -// message that it needs to handle. -func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error { - s.mu.Lock() - defer s.mu.Unlock() - - nic, ok := s.nics[id] - if !ok { - return tcpip.ErrUnknownNICID - } - - nic.handleNDPRA(ip, ra) - - return nil -} - // Seed returns a 32 bit value that can be used as a seed value for port // picking, ISN generation etc. // @@ -1947,21 +1869,17 @@ func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, addres defer s.mu.RUnlock() for _, nic := range s.nics { - id := NetworkEndpointID{address} - - if ref, ok := nic.mu.endpoints[id]; ok { - nic.mu.RLock() - defer nic.mu.RUnlock() - - // An endpoint with this id exists, check if it can be - // used and return it. - return ref.ep, nil + addressEndpoint := nic.getAddressOrCreateTempInner(netProto, address, false /* createTemp */, NeverPrimaryEndpoint) + if addressEndpoint == nil { + continue } + addressEndpoint.DecRef() + return nic.getNetworkEndpoint(netProto), nil } return nil, tcpip.ErrBadAddress } -// FindNICNameFromID returns the name of the nic for the given NICID. +// FindNICNameFromID returns the name of the NIC for the given NICID. func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { s.mu.RLock() defer s.mu.RUnlock() @@ -1973,3 +1891,8 @@ func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { return nic.Name() } + +// NewJob returns a new tcpip.Job using the stack's clock. +func (s *Stack) NewJob(l sync.Locker, f func()) *tcpip.Job { + return tcpip.NewJob(s.clock, l, f) +} diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 106645c50..e75f58c64 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -21,7 +21,6 @@ import ( "bytes" "fmt" "math" - "net" "sort" "testing" "time" @@ -29,6 +28,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -68,18 +68,40 @@ const ( // use the first three: destination address, source address, and transport // protocol. They're all one byte fields to simplify parsing. 
type fakeNetworkEndpoint struct { - nicID tcpip.NICID + stack.AddressableEndpointState + + mu struct { + sync.RWMutex + + enabled bool + } + + nic stack.NetworkInterface proto *fakeNetworkProtocol dispatcher stack.TransportDispatcher - ep stack.LinkEndpoint } -func (f *fakeNetworkEndpoint) MTU() uint32 { - return f.ep.MTU() - uint32(f.MaxHeaderLength()) +func (f *fakeNetworkEndpoint) Enable() *tcpip.Error { + f.mu.Lock() + defer f.mu.Unlock() + f.mu.enabled = true + return nil +} + +func (f *fakeNetworkEndpoint) Enabled() bool { + f.mu.RLock() + defer f.mu.RUnlock() + return f.mu.enabled } -func (f *fakeNetworkEndpoint) NICID() tcpip.NICID { - return f.nicID +func (f *fakeNetworkEndpoint) Disable() { + f.mu.Lock() + defer f.mu.Unlock() + f.mu.enabled = false +} + +func (f *fakeNetworkEndpoint) MTU() uint32 { + return f.nic.MTU() - uint32(f.MaxHeaderLength()) } func (*fakeNetworkEndpoint) DefaultTTL() uint8 { @@ -111,17 +133,13 @@ func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuff } func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 { - return f.ep.MaxHeaderLength() + fakeNetHeaderLen + return f.nic.MaxHeaderLength() + fakeNetHeaderLen } func (f *fakeNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 { return 0 } -func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities { - return f.ep.Capabilities() -} - func (f *fakeNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return f.proto.Number() } @@ -144,7 +162,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params return nil } - return f.ep.WritePacket(r, gso, fakeNetNumber, pkt) + return f.nic.WritePacket(r, gso, fakeNetNumber, pkt) } // WritePackets implements stack.LinkEndpoint.WritePackets. @@ -156,16 +174,8 @@ func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack return tcpip.ErrNotSupported } -func (*fakeNetworkEndpoint) Close() {} - -type fakeNetGoodOption bool - -type fakeNetBadOption bool - -type fakeNetInvalidValueOption int - -type fakeNetOptions struct { - good bool +func (f *fakeNetworkEndpoint) Close() { + f.AddressableEndpointState.Cleanup() } // fakeNetworkProtocol is a network-layer protocol descriptor. 
It aggregates the @@ -174,7 +184,12 @@ type fakeNetOptions struct { type fakeNetworkProtocol struct { packetCount [10]int sendPacketCount [10]int - opts fakeNetOptions + defaultTTL uint8 + + mu struct { + sync.RWMutex + forwarding bool + } } func (f *fakeNetworkProtocol) Number() tcpip.NetworkProtocolNumber { @@ -197,44 +212,43 @@ func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Addres return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1]) } -func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) stack.NetworkEndpoint { - return &fakeNetworkEndpoint{ - nicID: nicID, +func (f *fakeNetworkProtocol) NewEndpoint(nic stack.NetworkInterface, _ stack.LinkAddressCache, _ stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint { + e := &fakeNetworkEndpoint{ + nic: nic, proto: f, dispatcher: dispatcher, - ep: ep, } + e.AddressableEndpointState.Init(e) + return e } -func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error { +func (f *fakeNetworkProtocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error { switch v := option.(type) { - case fakeNetGoodOption: - f.opts.good = bool(v) + case *tcpip.DefaultTTLOption: + f.defaultTTL = uint8(*v) return nil - case fakeNetInvalidValueOption: - return tcpip.ErrInvalidOptionValue default: return tcpip.ErrUnknownProtocolOption } } -func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error { +func (f *fakeNetworkProtocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error { switch v := option.(type) { - case *fakeNetGoodOption: - *v = fakeNetGoodOption(f.opts.good) + case *tcpip.DefaultTTLOption: + *v = tcpip.DefaultTTLOption(f.defaultTTL) return nil default: return tcpip.ErrUnknownProtocolOption } } -// Close implements TransportProtocol.Close. +// Close implements NetworkProtocol.Close. func (*fakeNetworkProtocol) Close() {} -// Wait implements TransportProtocol.Wait. +// Wait implements NetworkProtocol.Wait. func (*fakeNetworkProtocol) Wait() {} -// Parse implements TransportProtocol.Parse. +// Parse implements NetworkProtocol.Parse. func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) { hdr, ok := pkt.NetworkHeader().Consume(fakeNetHeaderLen) if !ok { @@ -243,7 +257,21 @@ func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProto return tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), true, true } -func fakeNetFactory() stack.NetworkProtocol { +// Forwarding implements stack.ForwardingNetworkProtocol. +func (f *fakeNetworkProtocol) Forwarding() bool { + f.mu.RLock() + defer f.mu.RUnlock() + return f.mu.forwarding +} + +// SetForwarding implements stack.ForwardingNetworkProtocol. +func (f *fakeNetworkProtocol) SetForwarding(v bool) { + f.mu.Lock() + defer f.mu.Unlock() + f.mu.forwarding = v +} + +func fakeNetFactory(*stack.Stack) stack.NetworkProtocol { return &fakeNetworkProtocol{} } @@ -280,7 +308,7 @@ func TestNetworkReceive(t *testing.T) { // addresses attached to it: 1 & 2. 
ep := channel.New(10, defaultMTU, "") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) if err := s.CreateNIC(1, ep); err != nil { t.Fatal("CreateNIC failed:", err) @@ -440,7 +468,7 @@ func TestNetworkSend(t *testing.T) { // existing nic. ep := channel.New(10, defaultMTU, "") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) if err := s.CreateNIC(1, ep); err != nil { t.Fatal("NewNIC failed:", err) @@ -467,7 +495,7 @@ func TestNetworkSendMultiRoute(t *testing.T) { // addresses per nic, the first nic has odd address, the second one has // even addresses. s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep1 := channel.New(10, defaultMTU, "") @@ -567,7 +595,7 @@ func TestAttachToLinkEndpointImmediately(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) e := linkEPWithMockedAttach{ @@ -586,7 +614,7 @@ func TestAttachToLinkEndpointImmediately(t *testing.T) { func TestDisableUnknownNIC(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) if err := s.DisableNIC(1); err != tcpip.ErrUnknownNICID { @@ -598,7 +626,7 @@ func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) e := loopback.New() @@ -645,7 +673,7 @@ func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) { func TestRemoveUnknownNIC(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) if err := s.RemoveNIC(1); err != tcpip.ErrUnknownNICID { @@ -657,7 +685,7 @@ func TestRemoveNIC(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) e := linkEPWithMockedAttach{ @@ -718,7 +746,7 @@ func TestRouteWithDownNIC(t *testing.T) { setup := func(t *testing.T) (*stack.Stack, *channel.Endpoint, *channel.Endpoint) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep1 := channel.New(1, defaultMTU, "") @@ -884,7 +912,7 @@ func TestRoutes(t *testing.T) { // addresses per nic, the first nic has odd address, the second one has // even addresses. 
s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep1 := channel.New(10, defaultMTU, "") @@ -964,7 +992,7 @@ func TestAddressRemoval(t *testing.T) { remoteAddr := tcpip.Address("\x02") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1011,7 +1039,7 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) { remoteAddr := tcpip.Address("\x02") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1102,7 +1130,7 @@ func TestEndpointExpiration(t *testing.T) { for _, spoofing := range []bool{true, false} { t.Run(fmt.Sprintf("promiscuous=%t spoofing=%t", promiscuous, spoofing), func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1260,7 +1288,7 @@ func TestEndpointExpiration(t *testing.T) { func TestPromiscuousMode(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1312,7 +1340,7 @@ func TestSpoofingWithAddress(t *testing.T) { dstAddr := tcpip.Address("\x03") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1378,7 +1406,7 @@ func TestSpoofingNoAddress(t *testing.T) { dstAddr := tcpip.Address("\x02") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1441,7 +1469,7 @@ func verifyRoute(gotRoute, wantRoute stack.Route) error { func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1484,7 +1512,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { // Create a new stack with two NICs. 
s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, ep); err != nil { @@ -1585,7 +1613,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") @@ -1640,46 +1668,24 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { } } -func TestNetworkOptions(t *testing.T) { +func TestNetworkOption(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, - TransportProtocols: []stack.TransportProtocol{}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + TransportProtocols: []stack.TransportProtocolFactory{}, }) - // Try an unsupported network protocol. - if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol { - t.Fatalf("SetNetworkProtocolOption(fakeNet2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err) + opt := tcpip.DefaultTTLOption(5) + if err := s.SetNetworkProtocolOption(fakeNetNumber, &opt); err != nil { + t.Fatalf("s.SetNetworkProtocolOption(%d, &%T(%d)): %s", fakeNetNumber, opt, opt, err) } - testCases := []struct { - option interface{} - wantErr *tcpip.Error - verifier func(t *testing.T, p stack.NetworkProtocol) - }{ - {fakeNetGoodOption(true), nil, func(t *testing.T, p stack.NetworkProtocol) { - t.Helper() - fakeNet := p.(*fakeNetworkProtocol) - if fakeNet.opts.good != true { - t.Fatalf("fakeNet.opts.good = false, want = true") - } - var v fakeNetGoodOption - if err := s.NetworkProtocolOption(fakeNetNumber, &v); err != nil { - t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) = %v, want = nil, where v is option %T", v, err) - } - if v != true { - t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) returned v = %v, want = true", v) - } - }}, - {fakeNetBadOption(true), tcpip.ErrUnknownProtocolOption, nil}, - {fakeNetInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil}, + var optGot tcpip.DefaultTTLOption + if err := s.NetworkProtocolOption(fakeNetNumber, &optGot); err != nil { + t.Fatalf("s.NetworkProtocolOption(%d, &%T): %s", fakeNetNumber, optGot, err) } - for _, tc := range testCases { - if got := s.SetNetworkProtocolOption(fakeNetNumber, tc.option); got != tc.wantErr { - t.Errorf("s.SetNetworkProtocolOption(fakeNet, %v) = %v, want = %v", tc.option, got, tc.wantErr) - } - if tc.verifier != nil { - tc.verifier(t, s.NetworkProtocolInstance(fakeNetNumber)) - } + + if opt != optGot { + t.Errorf("got optGot = %d, want = %d", optGot, opt) } } @@ -1691,7 +1697,7 @@ func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) { for never := 0; never < 3; never++ { t.Run(fmt.Sprintf("never=%d", never), func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, ep); err != nil { @@ -1758,7 +1764,7 @@ func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) { func TestGetMainNICAddressAddRemove(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + 
NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, ep); err != nil { @@ -1843,7 +1849,7 @@ func verifyAddresses(t *testing.T, expectedAddresses, gotAddresses []tcpip.Proto func TestAddAddress(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(nicID, ep); err != nil { @@ -1870,7 +1876,7 @@ func TestAddAddress(t *testing.T) { func TestAddProtocolAddress(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(nicID, ep); err != nil { @@ -1904,7 +1910,7 @@ func TestAddProtocolAddress(t *testing.T) { func TestAddAddressWithOptions(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(nicID, ep); err != nil { @@ -1935,7 +1941,7 @@ func TestAddAddressWithOptions(t *testing.T) { func TestAddProtocolAddressWithOptions(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep := channel.New(10, defaultMTU, "") if err := s.CreateNIC(nicID, ep); err != nil { @@ -2056,7 +2062,7 @@ func TestCreateNICWithOptions(t *testing.T) { func TestNICStats(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep1 := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, ep1); err != nil { @@ -2097,7 +2103,7 @@ func TestNICStats(t *testing.T) { t.Errorf("got Tx.Packets.Value() = %d, ep1.Drain() = %d", got, want) } - if got, want := s.NICInfo()[1].Stats.Tx.Bytes.Value(), uint64(len(payload)); got != want { + if got, want := s.NICInfo()[1].Stats.Tx.Bytes.Value(), uint64(len(payload)+fakeNetHeaderLen); got != want { t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want) } } @@ -2123,9 +2129,9 @@ func TestNICForwarding(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) - s.SetForwarding(true) + s.SetForwarding(fakeNetNumber, true) ep1 := channel.New(10, defaultMTU, "") if err := s.CreateNIC(nicID1, ep1); err != nil { @@ -2247,7 +2253,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { nicName string autoGen bool linkAddr tcpip.LinkAddress - iidOpts stack.OpaqueInterfaceIdentifierOptions + iidOpts ipv6.OpaqueInterfaceIdentifierOptions shouldGen bool expectedAddr tcpip.Address }{ @@ -2263,7 +2269,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { nicName: "nic1", autoGen: false, linkAddr: linkAddr1, - iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: nicNameFunc, SecretKey: secretKey[:], }, @@ -2308,7 +2314,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { nicName: "nic1", autoGen: true, linkAddr: linkAddr1, - 
iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: nicNameFunc, SecretKey: secretKey[:], }, @@ -2320,7 +2326,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { { name: "OIID Empty MAC and empty nicName", autoGen: true, - iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: nicNameFunc, SecretKey: secretKey[:1], }, @@ -2332,7 +2338,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { nicName: "test", autoGen: true, linkAddr: "\x01\x02\x03", - iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: nicNameFunc, SecretKey: secretKey[:2], }, @@ -2344,7 +2350,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { nicName: "test2", autoGen: true, linkAddr: "\x01\x02\x03\x04\x05\x06", - iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: nicNameFunc, SecretKey: secretKey[:3], }, @@ -2356,7 +2362,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { nicName: "test3", autoGen: true, linkAddr: "\x00\x00\x00\x00\x00\x00", - iidOpts: stack.OpaqueInterfaceIdentifierOptions{ + iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: nicNameFunc, }, shouldGen: true, @@ -2370,10 +2376,11 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - AutoGenIPv6LinkLocal: test.autoGen, - NDPDisp: &ndpDisp, - OpaqueIIDOpts: test.iidOpts, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + AutoGenIPv6LinkLocal: test.autoGen, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: test.iidOpts, + })}, } e := channel.New(0, 1280, test.linkAddr) @@ -2445,15 +2452,15 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) { tests := []struct { name string - opaqueIIDOpts stack.OpaqueInterfaceIdentifierOptions + opaqueIIDOpts ipv6.OpaqueInterfaceIdentifierOptions }{ { name: "IID From MAC", - opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{}, + opaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{}, }, { name: "Opaque IID", - opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{ + opaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: func(_ tcpip.NICID, nicName string) string { return nicName }, @@ -2464,9 +2471,10 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - AutoGenIPv6LinkLocal: true, - OpaqueIIDOpts: test.opaqueIIDOpts, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + AutoGenIPv6LinkLocal: true, + OpaqueIIDOpts: test.opaqueIIDOpts, + })}, } e := loopback.New() @@ -2495,12 +2503,13 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent), } - ndpConfigs := stack.DefaultNDPConfigurations() + ndpConfigs := ipv6.DefaultNDPConfigurations() opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: ndpConfigs, - AutoGenIPv6LinkLocal: true, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ndpConfigs, + AutoGenIPv6LinkLocal: true, + NDPDisp: &ndpDisp, + })}, } e := 
channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1) @@ -2556,7 +2565,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { for _, ps := range pebs { t.Run(fmt.Sprintf("%d-to-%d", pi, ps), func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, }) ep1 := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, ep1); err != nil { @@ -2847,14 +2856,15 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) { t.Run(test.name, func(t *testing.T) { e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - HandleRAs: true, - AutoGenGlobalAddresses: true, - AutoGenTempGlobalAddresses: true, - }, - NDPDisp: &ndpDispatcher{}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: true, + AutoGenGlobalAddresses: true, + AutoGenTempGlobalAddresses: true, + }, + NDPDisp: &ndpDispatcher{}, + })}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -2903,7 +2913,7 @@ func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) { e := loopback.New() s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, }) nicOpts := stack.NICOptions{Disabled: true} if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { @@ -2955,7 +2965,7 @@ func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, }) e := channel.New(10, 1280, linkAddr1) if err := s.CreateNIC(1, e); err != nil { @@ -3016,7 +3026,7 @@ func TestJoinLeaveMulticastOnNICEnableDisable(t *testing.T) { t.Run(test.name, func(t *testing.T) { e := loopback.New() s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, }) nicOpts := stack.NICOptions{Disabled: true} if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { @@ -3093,12 +3103,13 @@ func TestDoDADWhenNICEnabled(t *testing.T) { dadC: make(chan ndpDADEvent), } opts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - NDPConfigs: stack.NDPConfigurations{ - DupAddrDetectTransmits: dadTransmits, - RetransmitTimer: retransmitTimer, - }, - NDPDisp: &ndpDisp, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + DupAddrDetectTransmits: dadTransmits, + RetransmitTimer: retransmitTimer, + }, + NDPDisp: &ndpDisp, + })}, } e := channel.New(dadTransmits, 1280, linkAddr1) @@ -3457,7 +3468,7 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, 
ipv6.NewProtocol}, }) ep := channel.New(0, defaultMTU, "") if err := s.CreateNIC(nicID1, ep); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) } @@ -3495,7 +3506,7 @@ func TestResolveWith(t *testing.T) { ) s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol}, }) ep := channel.New(0, defaultMTU, "") ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired @@ -3505,17 +3516,17 @@ addr := tcpip.ProtocolAddress{ Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: tcpip.Address(net.ParseIP("192.168.1.58").To4()), + Address: tcpip.Address([]byte{192, 168, 1, 58}), PrefixLen: 24, }, } if err := s.AddProtocolAddress(nicID, addr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err) + t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, addr, err) } s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}}) - remoteAddr := tcpip.Address(net.ParseIP("192.168.1.59").To4()) + remoteAddr := tcpip.Address([]byte{192, 168, 1, 59}) r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, remoteAddr, header.IPv4ProtocolNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, remoteAddr, header.IPv4ProtocolNumber, err) @@ -3533,3 +3544,131 @@ func TestResolveWith(t *testing.T) { t.Fatal("got r.IsResolutionRequired() = true, want = false") } } + +// TestRouteReleaseAfterAddrRemoval tests that releasing a Route after its +// associated address is removed does not cause a panic. +func TestRouteReleaseAfterAddrRemoval(t *testing.T) { + const ( + nicID = 1 + localAddr = tcpip.Address("\x01") + remoteAddr = tcpip.Address("\x02") + ) + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + }) + + ep := channel.New(0, defaultMTU, "") + if err := s.CreateNIC(nicID, ep); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil { + t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, fakeNetNumber, localAddr, err) + } + { + subnet, err := tcpip.NewSubnet("\x00", "\x00") + if err != nil { + t.Fatal(err) + } + s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}}) + } + + r, err := s.FindRoute(nicID, localAddr, remoteAddr, fakeNetNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, localAddr, remoteAddr, fakeNetNumber, err) + } + // Should not panic. + defer r.Release() + + // Remove the address; releasing the route afterwards must not panic.
+ if err := s.RemoveAddress(nicID, localAddr); err != nil { + t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, localAddr, err) + } +} + +func TestGetNetworkEndpoint(t *testing.T) { + const nicID = 1 + + tests := []struct { + name string + protoFactory stack.NetworkProtocolFactory + protoNum tcpip.NetworkProtocolNumber + }{ + { + name: "IPv4", + protoFactory: ipv4.NewProtocol, + protoNum: ipv4.ProtocolNumber, + }, + { + name: "IPv6", + protoFactory: ipv6.NewProtocol, + protoNum: ipv6.ProtocolNumber, + }, + } + + factories := make([]stack.NetworkProtocolFactory, 0, len(tests)) + for _, test := range tests { + factories = append(factories, test.protoFactory) + } + + s := stack.New(stack.Options{ + NetworkProtocols: factories, + }) + + if err := s.CreateNIC(nicID, channel.New(0, defaultMTU, "")); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ep, err := s.GetNetworkEndpoint(nicID, test.protoNum) + if err != nil { + t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, test.protoNum, err) + } + + if got := ep.NetworkProtocolNumber(); got != test.protoNum { + t.Fatalf("got ep.NetworkProtocolNumber() = %d, want = %d", got, test.protoNum) + } + }) + } +} + +func TestGetMainNICAddressWhenNICDisabled(t *testing.T) { + const nicID = 1 + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + }) + + if err := s.CreateNIC(nicID, channel.New(0, defaultMTU, "")); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + + protocolAddress := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: 8, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddress); err != nil { + t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, protocolAddress, err) + } + + // Check that we get the right initial address and prefix length. + if gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber); err != nil { + t.Fatalf("GetMainNICAddress(%d, %d): %s", nicID, fakeNetNumber, err) + } else if gotAddr != protocolAddress.AddressWithPrefix { + t.Fatalf("got GetMainNICAddress(%d, %d) = %s, want = %s", nicID, fakeNetNumber, gotAddr, protocolAddress.AddressWithPrefix) + } + + // Should still get the address when the NIC is disabled. + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("DisableNIC(%d): %s", nicID, err) + } + if gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber); err != nil { + t.Fatalf("GetMainNICAddress(%d, %d): %s", nicID, fakeNetNumber, err) + } else if gotAddr != protocolAddress.AddressWithPrefix { + t.Fatalf("got GetMainNICAddress(%d, %d) = %s, want = %s", nicID, fakeNetNumber, gotAddr, protocolAddress.AddressWithPrefix) + } +} diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index b902c6ca9..35e5b1a2e 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -155,7 +155,7 @@ func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint { func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) { epsByNIC.mu.RLock() - mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()] + mpep, ok := epsByNIC.endpoints[r.nic.ID()] if !ok { if mpep, ok = epsByNIC.endpoints[0]; !ok { epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
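The stack_test.go additions above exercise the new caller-facing surface of this change. As a standalone sketch of how those APIs read outside a test: the program below assumes a stack built with ipv4.NewProtocol and a channel link endpoint; the NIC ID, MTU, and both addresses are illustrative, and the neighbor calls may return ErrNotSupported when the NIC is not using the new neighbor cache.

    package main

    import (
    	"log"

    	"gvisor.dev/gvisor/pkg/tcpip"
    	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
    	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
    	"gvisor.dev/gvisor/pkg/tcpip/stack"
    )

    func main() {
    	const nicID tcpip.NICID = 1
    	s := stack.New(stack.Options{
    		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
    	})
    	if err := s.CreateNIC(nicID, channel.New(0 /* size */, 1500 /* mtu */, "")); err != nil {
    		log.Fatalf("CreateNIC(%d, _): %s", nicID, err)
    	}

    	// Per-NIC, per-protocol endpoint lookup via the new GetNetworkEndpoint.
    	ep, err := s.GetNetworkEndpoint(nicID, ipv4.ProtocolNumber)
    	if err != nil {
    		log.Fatalf("GetNetworkEndpoint(%d, %d): %s", nicID, ipv4.ProtocolNumber, err)
    	}
    	log.Printf("endpoint protocol: %d", ep.NetworkProtocolNumber())

    	// Static neighbor management; the addresses are made up for the
    	// example, and these calls return ErrNotSupported unless the NIC
    	// was created with the neighbor cache enabled.
    	neighbor := tcpip.Address("\xc0\xa8\x01\x02") // 192.168.1.2
    	linkAddr := tcpip.LinkAddress("\x02\x00\x00\x00\x00\x01")
    	if err := s.AddStaticNeighbor(nicID, neighbor, linkAddr); err != nil {
    		log.Printf("AddStaticNeighbor: %s", err)
    	}
    	if entries, err := s.Neighbors(nicID); err == nil {
    		log.Printf("%d neighbor entries", len(entries))
    	}
    }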
@@ -165,7 +165,7 @@ func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, p // If this is a broadcast or multicast datagram, deliver the datagram to all // endpoints bound to the right device. - if isMulticastOrBroadcast(id.LocalAddress) { + if isInboundMulticastOrBroadcast(r) { mpep.handlePacketAll(r, id, pkt) epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return @@ -526,7 +526,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto // If the packet is a UDP broadcast or multicast, then find all matching // transport endpoints. - if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) { + if protocol == header.UDPProtocolNumber && isInboundMulticastOrBroadcast(r) { eps.mu.RLock() destEPs := eps.findAllEndpointsLocked(id) eps.mu.RUnlock() @@ -544,9 +544,11 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto return true } - // If the packet is a TCP packet with a non-unicast source or destination - // address, then do nothing further and instruct the caller to do the same. + // If the packet is a TCP packet with an unspecified source or non-unicast + // destination address, then do nothing further and instruct the caller to do + // the same. The network layer handles address validation for specified source + // addresses. + if protocol == header.TCPProtocolNumber && (!isSpecified(r.LocalAddress) || !isSpecified(r.RemoteAddress) || isInboundMulticastOrBroadcast(r)) { // TCP can only be used to communicate between a single source and a // single destination; the addresses must be unicast. r.Stats().TCP.InvalidSegmentsReceived.Increment() @@ -626,7 +628,7 @@ func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolN epsByNIC.mu.RLock() eps.mu.RUnlock() - mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()] + mpep, ok := epsByNIC.endpoints[r.nic.ID()] if !ok { if mpep, ok = epsByNIC.endpoints[0]; !ok { epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. @@ -677,10 +679,10 @@ func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolN eps.mu.Unlock() } -func isMulticastOrBroadcast(addr tcpip.Address) bool { - return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) +func isInboundMulticastOrBroadcast(r *Route) bool { + return r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) || header.IsV6MulticastAddress(r.LocalAddress) } -func isUnicast(addr tcpip.Address) bool { - return addr != header.IPv4Any && addr != header.IPv6Any && !isMulticastOrBroadcast(addr) +func isSpecified(addr tcpip.Address) bool { + return addr != header.IPv4Any && addr != header.IPv6Any } diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go index 1339edc2d..698c8609e 100644 --- a/pkg/tcpip/stack/transport_demuxer_test.go +++ b/pkg/tcpip/stack/transport_demuxer_test.go @@ -51,8 +51,8 @@ type testContext struct { // newDualTestContextMultiNIC creates the testing context and a NIC for each ID in linkEpIDs.
func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) linkEps := make(map[tcpip.NICID]*channel.Endpoint) for _, linkEpID := range linkEpIDs { @@ -182,8 +182,8 @@ func TestTransportDemuxerRegister(t *testing.T) { } { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) var wq waiter.Queue ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq) @@ -312,8 +312,8 @@ func TestBindToDeviceDistribution(t *testing.T) { t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err) } bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice) - if err := ep.SetSockOpt(bindToDeviceOption); err != nil { - t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err) + if err := ep.SetSockOpt(&bindToDeviceOption); err != nil { + t.Fatalf("SetSockOpt(&%T(%d)) on endpoint %d failed: %s", bindToDeviceOption, bindToDeviceOption, i, err) } var dstAddr tcpip.Address diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 6c6e44468..6b8071467 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -28,7 +28,7 @@ import ( const ( fakeTransNumber tcpip.TransportProtocolNumber = 1 - fakeTransHeaderLen = 3 + fakeTransHeaderLen int = 3 ) // fakeTransportEndpoint is a transport-layer protocol endpoint. It counts @@ -39,7 +39,7 @@ const ( // use it. 
type fakeTransportEndpoint struct { stack.TransportEndpointInfo - stack *stack.Stack + proto *fakeTransportProtocol peerAddr tcpip.Address route stack.Route @@ -53,14 +53,14 @@ func (f *fakeTransportEndpoint) Info() tcpip.EndpointInfo { return &f.TransportEndpointInfo } -func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats { +func (*fakeTransportEndpoint) Stats() tcpip.EndpointStats { return nil } -func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {} +func (*fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {} -func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint { - return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} +func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint { + return &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} } func (f *fakeTransportEndpoint) Abort() { @@ -100,12 +100,12 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions return int64(len(v)), nil, nil } -func (f *fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { +func (*fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { return 0, tcpip.ControlMessages{}, nil } // SetSockOpt sets a socket option. Currently not supported. -func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error { +func (*fakeTransportEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error { return tcpip.ErrInvalidEndpointState } @@ -130,11 +130,7 @@ func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.E } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch opt.(type) { - case tcpip.ErrorOption: - return nil - } +func (*fakeTransportEndpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { return tcpip.ErrInvalidEndpointState } @@ -147,7 +143,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { f.peerAddr = addr.Addr // Find the route. - r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */) + r, err := f.proto.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */) if err != nil { return tcpip.ErrNoRoute } @@ -155,7 +151,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Try to register so that we can start receiving packets. 
f.ID.RemoteAddress = addr.Addr - err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */) + err = f.proto.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */) if err != nil { return err } @@ -169,7 +165,7 @@ func (f *fakeTransportEndpoint) UniqueID() uint64 { return f.uniqueID } -func (f *fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error { +func (*fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error { return nil } @@ -184,7 +180,7 @@ func (*fakeTransportEndpoint) Listen(int) *tcpip.Error { return nil } -func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { +func (f *fakeTransportEndpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { if len(f.acceptQueue) == 0 { return nil, nil, nil } @@ -194,7 +190,7 @@ func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip. } func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error { - if err := f.stack.RegisterTransportEndpoint( + if err := f.proto.stack.RegisterTransportEndpoint( a.NIC, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, @@ -222,7 +218,6 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE f.proto.packetCount++ if f.acceptQueue != nil { f.acceptQueue = append(f.acceptQueue, fakeTransportEndpoint{ - stack: f.stack, TransportEndpointInfo: stack.TransportEndpointInfo{ ID: f.ID, NetProto: f.NetProto, @@ -239,19 +234,19 @@ func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, s f.proto.controlCount++ } -func (f *fakeTransportEndpoint) State() uint32 { +func (*fakeTransportEndpoint) State() uint32 { return 0 } -func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {} +func (*fakeTransportEndpoint) ModerateRecvBuf(copied int) {} -func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) { - return stack.IPTables{}, nil -} +func (*fakeTransportEndpoint) Resume(*stack.Stack) {} -func (f *fakeTransportEndpoint) Resume(*stack.Stack) {} +func (*fakeTransportEndpoint) Wait() {} -func (f *fakeTransportEndpoint) Wait() {} +func (*fakeTransportEndpoint) LastError() *tcpip.Error { + return nil +} type fakeTransportGoodOption bool @@ -266,6 +261,8 @@ type fakeTransportProtocolOptions struct { // fakeTransportProtocol is a transport-layer protocol descriptor. It // aggregates the number of packets received via endpoints of this protocol. 
type fakeTransportProtocol struct { + stack *stack.Stack + packetCount int controlCount int opts fakeTransportProtocolOptions @@ -275,11 +272,11 @@ func (*fakeTransportProtocol) Number() tcpip.TransportProtocolNumber { return fakeTransNumber } -func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil +func (f *fakeTransportProtocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newFakeTransportEndpoint(f, netProto, f.stack.UniqueID()), nil } -func (*fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { +func (*fakeTransportProtocol) NewRawEndpoint(tcpip.NetworkProtocolNumber, *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { return nil, tcpip.ErrUnknownProtocol } @@ -291,26 +288,24 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp return 0, 0, nil } -func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool { - return true +func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { + return stack.UnknownDestinationPacketHandled } -func (f *fakeTransportProtocol) SetOption(option interface{}) *tcpip.Error { +func (f *fakeTransportProtocol) SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error { switch v := option.(type) { - case fakeTransportGoodOption: - f.opts.good = bool(v) + case *tcpip.TCPModerateReceiveBufferOption: + f.opts.good = bool(*v) return nil - case fakeTransportInvalidValueOption: - return tcpip.ErrInvalidOptionValue default: return tcpip.ErrUnknownProtocolOption } } -func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error { +func (f *fakeTransportProtocol) Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error { switch v := option.(type) { - case *fakeTransportGoodOption: - *v = fakeTransportGoodOption(f.opts.good) + case *tcpip.TCPModerateReceiveBufferOption: + *v = tcpip.TCPModerateReceiveBufferOption(f.opts.good) return nil default: return tcpip.ErrUnknownProtocolOption @@ -332,15 +327,15 @@ func (*fakeTransportProtocol) Parse(pkt *stack.PacketBuffer) bool { return ok } -func fakeTransFactory() stack.TransportProtocol { - return &fakeTransportProtocol{} +func fakeTransFactory(s *stack.Stack) stack.TransportProtocol { + return &fakeTransportProtocol{stack: s} } func TestTransportReceive(t *testing.T) { linkEP := channel.New(10, defaultMTU, "") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, - TransportProtocols: []stack.TransportProtocol{fakeTransFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory}, }) if err := s.CreateNIC(1, linkEP); err != nil { t.Fatalf("CreateNIC failed: %v", err) @@ -410,8 +405,8 @@ func TestTransportReceive(t *testing.T) { func TestTransportControlReceive(t *testing.T) { linkEP := channel.New(10, defaultMTU, "") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, - TransportProtocols: []stack.TransportProtocol{fakeTransFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, 
+ TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory}, }) if err := s.CreateNIC(1, linkEP); err != nil { t.Fatalf("CreateNIC failed: %v", err) @@ -487,8 +482,8 @@ func TestTransportControlReceive(t *testing.T) { func TestTransportSend(t *testing.T) { linkEP := channel.New(10, defaultMTU, "") s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, - TransportProtocols: []stack.TransportProtocol{fakeTransFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory}, }) if err := s.CreateNIC(1, linkEP); err != nil { t.Fatalf("CreateNIC failed: %v", err) @@ -533,54 +528,29 @@ func TestTransportSend(t *testing.T) { func TestTransportOptions(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, - TransportProtocols: []stack.TransportProtocol{fakeTransFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory}, }) - // Try an unsupported transport protocol. - if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol { - t.Fatalf("SetTransportProtocolOption(fakeTrans2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err) - } - - testCases := []struct { - option interface{} - wantErr *tcpip.Error - verifier func(t *testing.T, p stack.TransportProtocol) - }{ - {fakeTransportGoodOption(true), nil, func(t *testing.T, p stack.TransportProtocol) { - t.Helper() - fakeTrans := p.(*fakeTransportProtocol) - if fakeTrans.opts.good != true { - t.Fatalf("fakeTrans.opts.good = false, want = true") - } - var v fakeTransportGoodOption - if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil { - t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) = %v, want = nil, where v is option %T", v, err) - } - if v != true { - t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) returned v = %v, want = true", v) - } - - }}, - {fakeTransportBadOption(true), tcpip.ErrUnknownProtocolOption, nil}, - {fakeTransportInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil}, - } - for _, tc := range testCases { - if got := s.SetTransportProtocolOption(fakeTransNumber, tc.option); got != tc.wantErr { - t.Errorf("s.SetTransportProtocolOption(fakeTrans, %v) = %v, want = %v", tc.option, got, tc.wantErr) - } - if tc.verifier != nil { - tc.verifier(t, s.TransportProtocolInstance(fakeTransNumber)) - } + v := tcpip.TCPModerateReceiveBufferOption(true) + if err := s.SetTransportProtocolOption(fakeTransNumber, &v); err != nil { + t.Errorf("s.SetTransportProtocolOption(fakeTrans, &%T(%t)): %s", v, v, err) + } + v = false + if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil { + t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &%T): %s", v, err) + } + if !v { + t.Fatalf("got tcpip.TCPModerateReceiveBufferOption = false, want = true") } } func TestTransportForwarding(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}, - TransportProtocols: []stack.TransportProtocol{fakeTransFactory()}, + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory}, }) - s.SetForwarding(true) + s.SetForwarding(fakeNetNumber, true) // TODO(b/123449044): Change this to a channel NIC. 
ep1 := loopback.New() @@ -635,7 +605,7 @@ func TestTransportForwarding(t *testing.T) { Data: req.ToVectorisedView(), })) - aep, _, err := ep.Accept() + aep, _, err := ep.Accept(nil) if err != nil || aep == nil { t.Fatalf("Accept failed: %v, %v", aep, err) } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 07c85ce59..d77848d61 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -111,6 +111,7 @@ var ( ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} ErrNotPermitted = &Error{msg: "operation not permitted"} ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"} + ErrMalformedHeader = &Error{msg: "header is malformed"} ) var messageToError map[string]*Error @@ -159,6 +160,7 @@ func StringToError(s string) *Error { ErrBroadcastDisabled, ErrNotPermitted, ErrAddressFamilyNotSupported, + ErrMalformedHeader, } messageToError = make(map[string]*Error) @@ -237,6 +239,14 @@ type Timer interface { // network node. Or, in the case of unix endpoints, it may represent a path. type Address string +// WithPrefix returns the address with a prefix that represents a point subnet. +func (a Address) WithPrefix() AddressWithPrefix { + return AddressWithPrefix{ + Address: a, + PrefixLen: len(a) * 8, + } +} + // AddressMask is a bitmask for an address. type AddressMask string @@ -561,7 +571,10 @@ type Endpoint interface { // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. - Accept() (Endpoint, *waiter.Queue, *Error) + // + // If peerAddr is not nil then it is populated with the peer address of the + // returned endpoint. + Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, *Error) // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. @@ -578,8 +591,8 @@ type Endpoint interface { // if waiter.EventIn is set, the endpoint is immediately readable. Readiness(mask waiter.EventMask) waiter.EventMask - // SetSockOpt sets a socket option. opt should be one of the *Option types. - SetSockOpt(opt interface{}) *Error + // SetSockOpt sets a socket option. + SetSockOpt(opt SettableSocketOption) *Error // SetSockOptBool sets a socket option, for simple cases where a value // has the bool type. @@ -589,9 +602,8 @@ type Endpoint interface { // has the int type. SetSockOptInt(opt SockOptInt, v int) *Error - // GetSockOpt gets a socket option. opt should be a pointer to one of the - // *Option types. - GetSockOpt(opt interface{}) *Error + // GetSockOpt gets a socket option. + GetSockOpt(opt GettableSocketOption) *Error // GetSockOptBool gets a socket option for simple cases where a return // value has the bool type. @@ -620,6 +632,9 @@ type Endpoint interface { // SetOwner sets the task owner to the endpoint owner. SetOwner(owner PacketOwner) + + // LastError clears and returns the last error reported by the endpoint. + LastError() *Error } // LinkPacketInfo holds Link layer information for a received packet. @@ -839,14 +854,134 @@ const ( PMTUDiscoveryProbe ) -// ErrorOption is used in GetSockOpt to specify that the last error reported by -// the endpoint should be cleared and returned. -type ErrorOption struct{} +// GettableNetworkProtocolOption is a marker interface for network protocol +// options that may be queried. +type GettableNetworkProtocolOption interface { + isGettableNetworkProtocolOption() +} + +// SettableNetworkProtocolOption is a marker interface for network protocol +// options that may be set. 
+type SettableNetworkProtocolOption interface {
+	isSettableNetworkProtocolOption()
+}
+
+// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
+// a default TTL.
+type DefaultTTLOption uint8
+
+func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}
+
+func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}
+
+// GettableTransportProtocolOption is a marker interface for transport protocol
+// options that may be queried.
+type GettableTransportProtocolOption interface {
+	isGettableTransportProtocolOption()
+}
+
+// SettableTransportProtocolOption is a marker interface for transport protocol
+// options that may be set.
+type SettableTransportProtocolOption interface {
+	isSettableTransportProtocolOption()
+}
+
+// TCPSACKEnabled enables/disables the SACK option for TCP.
+//
+// See: https://tools.ietf.org/html/rfc2018.
+type TCPSACKEnabled bool
+
+func (*TCPSACKEnabled) isGettableTransportProtocolOption() {}
+
+func (*TCPSACKEnabled) isSettableTransportProtocolOption() {}
+
+// TCPRecovery is the loss detection algorithm used by TCP.
+type TCPRecovery int32
+
+func (*TCPRecovery) isGettableTransportProtocolOption() {}
+
+func (*TCPRecovery) isSettableTransportProtocolOption() {}
+
+const (
+	// TCPRACKLossDetection indicates RACK is used for loss detection and
+	// recovery.
+	TCPRACKLossDetection TCPRecovery = 1 << iota
+
+	// TCPRACKStaticReoWnd indicates the reordering window should not be
+	// adjusted when DSACK is received.
+	TCPRACKStaticReoWnd
+
+	// TCPRACKNoDupTh indicates RACK should not consider the classic three
+	// duplicate acknowledgements rule to mark the segments as lost. This
+	// is used when reordering is not detected.
+	TCPRACKNoDupTh
+)
+
+// TCPDelayEnabled enables/disables Nagle's algorithm in TCP.
+type TCPDelayEnabled bool
+
+func (*TCPDelayEnabled) isGettableTransportProtocolOption() {}
+
+func (*TCPDelayEnabled) isSettableTransportProtocolOption() {}
+
+// TCPSendBufferSizeRangeOption is the send buffer size range for TCP.
+type TCPSendBufferSizeRangeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {}
+
+// TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP.
+type TCPReceiveBufferSizeRangeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {}
+
+func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {}
+
+// TCPAvailableCongestionControlOption is the list of supported congestion
+// control algorithms for TCP.
+type TCPAvailableCongestionControlOption string
+
+func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {}
+
+func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {}
+
+// TCPModerateReceiveBufferOption enables/disables receive buffer moderation
+// for TCP.
+type TCPModerateReceiveBufferOption bool
+
+func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {}
+
+func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {}
+
+// GettableSocketOption is a marker interface for socket options that may be
+// queried.
+type GettableSocketOption interface {
+	isGettableSocketOption()
+}
+
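// Editorial sketch (not part of the patch): with the marker interfaces above,
// protocols type-switch on concrete pointer types in Option/SetOption, as
// fakeTransportProtocol does in transport_test.go. Enabling SACK stack-wide
// would then look roughly like this (assuming a stack s and the tcp package):
//
//	opt := tcpip.TCPSACKEnabled(true)
//	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
//		// handle the *tcpip.Error
//	}
//
// A new option participates simply by declaring the relevant is...() methods
// on its pointer type.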
+// SettableSocketOption is a marker interface for socket options that may be
+// configured.
+type SettableSocketOption interface {
+	isSettableSocketOption()
+}

// BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
// should bind only on a specific NIC.
type BindToDeviceOption NICID

+func (*BindToDeviceOption) isGettableSocketOption() {}
+
+func (*BindToDeviceOption) isSettableSocketOption() {}
+
// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
//
// TODO(b/64800844): Add and populate stat fields.
@@ -855,68 +990,143 @@ type TCPInfoOption struct {
	RTTVar time.Duration
}

+func (*TCPInfoOption) isGettableSocketOption() {}
+
// KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
// connection must remain idle before the first TCP keepalive packet is sent.
// Once this time is reached, KeepaliveIntervalOption is used instead.
type KeepaliveIdleOption time.Duration

+func (*KeepaliveIdleOption) isGettableSocketOption() {}
+
+func (*KeepaliveIdleOption) isSettableSocketOption() {}
+
// KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
// interval between sending TCP keepalive packets.
type KeepaliveIntervalOption time.Duration

+func (*KeepaliveIntervalOption) isGettableSocketOption() {}
+
+func (*KeepaliveIntervalOption) isSettableSocketOption() {}
+
// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a
// user-specified timeout for a given TCP connection.
// See: RFC5482 for details.
type TCPUserTimeoutOption time.Duration

+func (*TCPUserTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPUserTimeoutOption) isSettableSocketOption() {}
+
// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
// the current congestion control algorithm.
type CongestionControlOption string

-// AvailableCongestionControlOption is used to query the supported congestion
-// control algorithms.
-type AvailableCongestionControlOption string
+func (*CongestionControlOption) isGettableSocketOption() {}
+
+func (*CongestionControlOption) isSettableSocketOption() {}

-// ModerateReceiveBufferOption is used by buffer moderation.
-type ModerateReceiveBufferOption bool
+func (*CongestionControlOption) isGettableTransportProtocolOption() {}
+
+func (*CongestionControlOption) isSettableTransportProtocolOption() {}

// TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
// before being marked closed.
type TCPLingerTimeoutOption time.Duration

+func (*TCPLingerTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPLingerTimeoutOption) isSettableSocketOption() {}
+
+func (*TCPLingerTimeoutOption) isGettableTransportProtocolOption() {}
+
+func (*TCPLingerTimeoutOption) isSettableTransportProtocolOption() {}
+
// TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum duration for which a socket lingers in the TIME_WAIT state
// before being marked closed.
type TCPTimeWaitTimeoutOption time.Duration

+func (*TCPTimeWaitTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isSettableSocketOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isGettableTransportProtocolOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isSettableTransportProtocolOption() {}
+
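// Editorial sketch (not part of the patch): endpoint-level options follow the
// same typed pattern through Endpoint.SetSockOpt/GetSockOpt. Tuning keepalive
// timing on a hypothetical endpoint ep would look roughly like:
//
//	idle := tcpip.KeepaliveIdleOption(30 * time.Second)
//	if err := ep.SetSockOpt(&idle); err != nil {
//		// handle the *tcpip.Error
//	}
//
//	var got tcpip.KeepaliveIdleOption
//	if err := ep.GetSockOpt(&got); err != nil {
//		// handle the *tcpip.Error
//	}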
// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow an
// accept to return a completed connection only when there is data to be
// read. This usually means the listening socket will drop the final ACK
// of the handshake until a segment with data arrives or the specified
// timeout expires.
type TCPDeferAcceptOption time.Duration

+func (*TCPDeferAcceptOption) isGettableSocketOption() {}
+
+func (*TCPDeferAcceptOption) isSettableSocketOption() {}
+
// TCPMinRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
// the default MinRTO used by the Stack.
type TCPMinRTOOption time.Duration

+func (*TCPMinRTOOption) isGettableSocketOption() {}
+
+func (*TCPMinRTOOption) isSettableSocketOption() {}
+
+func (*TCPMinRTOOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMinRTOOption) isSettableTransportProtocolOption() {}
+
// TCPMaxRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
// the default MaxRTO used by the Stack.
type TCPMaxRTOOption time.Duration

+func (*TCPMaxRTOOption) isGettableSocketOption() {}
+
+func (*TCPMaxRTOOption) isSettableSocketOption() {}
+
+func (*TCPMaxRTOOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMaxRTOOption) isSettableTransportProtocolOption() {}
+
// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum number of retransmits after which we time out the connection.
type TCPMaxRetriesOption uint64

+func (*TCPMaxRetriesOption) isGettableSocketOption() {}
+
+func (*TCPMaxRetriesOption) isSettableSocketOption() {}
+
+func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}
+
// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
// the number of endpoints that can be in SYN-RCVD state before the stack
// switches to using SYN cookies.
type TCPSynRcvdCountThresholdOption uint64

+func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isSettableTransportProtocolOption() {}
+
// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify the
// stack-wide default for the number of times SYN is retransmitted before
// aborting a connect.
type TCPSynRetriesOption uint8

+func (*TCPSynRetriesOption) isGettableSocketOption() {}
+
+func (*TCPSynRetriesOption) isSettableSocketOption() {}
+
+func (*TCPSynRetriesOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSynRetriesOption) isSettableTransportProtocolOption() {}
+
// MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
// default interface for multicast.
type MulticastInterfaceOption struct {
@@ -924,45 +1134,61 @@ type MulticastInterfaceOption struct {
	InterfaceAddr Address
}

-// MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
-// AddMembershipOption and RemoveMembershipOption.
+func (*MulticastInterfaceOption) isGettableSocketOption() {}
+
+func (*MulticastInterfaceOption) isSettableSocketOption() {}
+
+// MembershipOption is used to identify a multicast membership on an interface.
type MembershipOption struct {
	NIC           NICID
	InterfaceAddr Address
	MulticastAddr Address
}

-// AddMembershipOption is used by SetSockOpt/GetSockOpt to join a multicast
-// group identified by the given multicast address, on the interface matching
-// the given interface address.
+// AddMembershipOption identifies a multicast group to join on some interface.
type AddMembershipOption MembershipOption
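// Editorial sketch (not part of the patch): joining a multicast group on NIC 1
// with a placeholder group address would look roughly like:
//
//	opt := tcpip.AddMembershipOption{
//		NIC:           1,
//		MulticastAddr: "\xe0\x00\x00\x01", // 224.0.0.1
//	}
//	if err := ep.SetSockOpt(&opt); err != nil {
//		// handle the *tcpip.Error
//	}
//
// AddMembershipOption is settable-only; note the single marker method below.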
-// RemoveMembershipOption is used by SetSockOpt/GetSockOpt to leave a multicast
-// group identified by the given multicast address, on the interface matching
-// the given interface address.
+func (*AddMembershipOption) isSettableSocketOption() {}
+
+// RemoveMembershipOption identifies a multicast group to leave on some
+// interface.
type RemoveMembershipOption MembershipOption

+func (*RemoveMembershipOption) isSettableSocketOption() {}
+
// OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether
// TCP out-of-band data is delivered along with the normal in-band data.
type OutOfBandInlineOption int

-// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
-// a default TTL.
-type DefaultTTLOption uint8
+func (*OutOfBandInlineOption) isGettableSocketOption() {}
+
+func (*OutOfBandInlineOption) isSettableSocketOption() {}

// SocketDetachFilterOption is used by SetSockOpt to detach a previously attached
// classic BPF filter on a given endpoint.
type SocketDetachFilterOption int

+func (*SocketDetachFilterOption) isSettableSocketOption() {}
+
// OriginalDestinationOption is used to get the original destination address
// and port of a redirected packet.
type OriginalDestinationOption FullAddress

+func (*OriginalDestinationOption) isGettableSocketOption() {}
+
// TCPTimeWaitReuseOption is used by stack.(*Stack).TransportProtocolOption to
// specify if the stack can reuse the port bound by an endpoint in TIME-WAIT for
// new connections when it is safe from a protocol viewpoint.
type TCPTimeWaitReuseOption uint8

+func (*TCPTimeWaitReuseOption) isGettableSocketOption() {}
+
+func (*TCPTimeWaitReuseOption) isSettableSocketOption() {}
+
+func (*TCPTimeWaitReuseOption) isGettableTransportProtocolOption() {}
+
+func (*TCPTimeWaitReuseOption) isSettableTransportProtocolOption() {}
+
const (
	// TCPTimeWaitReuseDisabled indicates that ports bound by endpoints in
	// TIME-WAIT cannot be reused for new connections.
@@ -978,6 +1204,19 @@ const (
	TCPTimeWaitReuseLoopbackOnly
)

+// LingerOption is used by SetSockOpt/GetSockOpt to set/get the
+// duration for which a socket lingers before returning from Close.
+//
+// +stateify savable
+type LingerOption struct {
+	Enabled bool
+	Timeout time.Duration
+}
+
+func (*LingerOption) isGettableSocketOption() {}
+
+func (*LingerOption) isSettableSocketOption() {}
+
// IPPacketInfo is the message structure for IP_PKTINFO.
//
// +stateify savable
@@ -1020,7 +1259,10 @@ func (r Route) String() string {
// TransportProtocolNumber is the number of a transport protocol.
type TransportProtocolNumber uint32

-// NetworkProtocolNumber is the number of a network protocol.
+// NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet
+// frame.
+//
+// See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml
type NetworkProtocolNumber uint32

// A StatCounter keeps track of a statistic.
@@ -1183,6 +1425,10 @@ type ICMPv6ReceivedPacketStats struct {
	// Invalid is the total number of ICMPv6 packets received that the
	// transport layer could not parse.
	Invalid *StatCounter
+
+	// RouterOnlyPacketsDroppedByHost is the total number of ICMPv6 packets
+	// dropped due to being router-specific packets.
+	RouterOnlyPacketsDroppedByHost *StatCounter
}

// ICMPStats collects ICMP-specific stats (both v4 and v6).
@@ -1238,6 +1484,18 @@ type IPStats struct {
	// MalformedFragmentsReceived is the total number of IP Fragments that were
	// dropped due to the fragment failing validation checks.
	MalformedFragmentsReceived *StatCounter
+
+	// IPTablesPreroutingDropped is the total number of IP packets dropped
+	// in the Prerouting chain.
+ IPTablesPreroutingDropped *StatCounter + + // IPTablesInputDropped is the total number of IP packets dropped in + // the Input chain. + IPTablesInputDropped *StatCounter + + // IPTablesOutputDropped is the total number of IP packets dropped in + // the Output chain. + IPTablesOutputDropped *StatCounter } // TCPStats collects TCP-specific stats. @@ -1366,9 +1624,6 @@ type UDPStats struct { // ChecksumErrors is the number of datagrams dropped due to bad checksums. ChecksumErrors *StatCounter - - // InvalidSourceAddress is the number of invalid sourced datagrams dropped. - InvalidSourceAddress *StatCounter } // Stats holds statistics about the networking stack. diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD index 6d52af98a..34aab32d0 100644 --- a/pkg/tcpip/tests/integration/BUILD +++ b/pkg/tcpip/tests/integration/BUILD @@ -5,12 +5,21 @@ package(licenses = ["notice"]) go_test( name = "integration_test", size = "small", - srcs = ["multicast_broadcast_test.go"], + srcs = [ + "forward_test.go", + "link_resolution_test.go", + "loopback_test.go", + "multicast_broadcast_test.go", + ], deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/link/channel", + "//pkg/tcpip/link/ethernet", + "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/pipe", + "//pkg/tcpip/network/arp", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/tests/integration/forward_test.go b/pkg/tcpip/tests/integration/forward_test.go new file mode 100644 index 000000000..0dcef7b04 --- /dev/null +++ b/pkg/tcpip/tests/integration/forward_test.go @@ -0,0 +1,379 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package integration_test + +import ( + "net" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" + "gvisor.dev/gvisor/pkg/tcpip/link/pipe" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestForwarding(t *testing.T) { + const ( + host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06") + routerNIC1LinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x07") + routerNIC2LinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x08") + host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09") + + host1NICID = 1 + routerNICID1 = 2 + routerNICID2 = 3 + host2NICID = 4 + + listenPort = 8080 + ) + + host1IPv4Addr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("192.168.0.2").To4()), + PrefixLen: 24, + }, + } + routerNIC1IPv4Addr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("192.168.0.1").To4()), + PrefixLen: 24, + }, + } + routerNIC2IPv4Addr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("10.0.0.1").To4()), + PrefixLen: 8, + }, + } + host2IPv4Addr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("10.0.0.2").To4()), + PrefixLen: 8, + }, + } + host1IPv6Addr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("a::2").To16()), + PrefixLen: 64, + }, + } + routerNIC1IPv6Addr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("a::1").To16()), + PrefixLen: 64, + }, + } + routerNIC2IPv6Addr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("b::1").To16()), + PrefixLen: 64, + }, + } + host2IPv6Addr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("b::2").To16()), + PrefixLen: 64, + }, + } + + type endpointAndAddresses struct { + serverEP tcpip.Endpoint + serverAddr tcpip.Address + serverReadableCH chan struct{} + + clientEP tcpip.Endpoint + clientAddr tcpip.Address + clientReadableCH chan struct{} + } + + newEP := func(t *testing.T, s *stack.Stack, transProto tcpip.TransportProtocolNumber, netProto tcpip.NetworkProtocolNumber) (tcpip.Endpoint, chan struct{}) { + t.Helper() + var wq waiter.Queue + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + ep, err := s.NewEndpoint(transProto, netProto, &wq) + if err != nil { + t.Fatalf("s.NewEndpoint(%d, %d, _): %s", transProto, netProto, err) + } + + t.Cleanup(func() { + wq.EventUnregister(&we) + }) + + return ep, ch + } + + tests := []struct { + name string + epAndAddrs func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack) endpointAndAddresses + }{ + { + name: "IPv4 host1 server with host2 client", + epAndAddrs: func(t *testing.T, host1Stack, routerStack, 
host2Stack *stack.Stack) endpointAndAddresses { + ep1, ep1WECH := newEP(t, host1Stack, udp.ProtocolNumber, ipv4.ProtocolNumber) + ep2, ep2WECH := newEP(t, host2Stack, udp.ProtocolNumber, ipv4.ProtocolNumber) + return endpointAndAddresses{ + serverEP: ep1, + serverAddr: host1IPv4Addr.AddressWithPrefix.Address, + serverReadableCH: ep1WECH, + + clientEP: ep2, + clientAddr: host2IPv4Addr.AddressWithPrefix.Address, + clientReadableCH: ep2WECH, + } + }, + }, + { + name: "IPv6 host2 server with host1 client", + epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack) endpointAndAddresses { + ep1, ep1WECH := newEP(t, host2Stack, udp.ProtocolNumber, ipv6.ProtocolNumber) + ep2, ep2WECH := newEP(t, host1Stack, udp.ProtocolNumber, ipv6.ProtocolNumber) + return endpointAndAddresses{ + serverEP: ep1, + serverAddr: host2IPv6Addr.AddressWithPrefix.Address, + serverReadableCH: ep1WECH, + + clientEP: ep2, + clientAddr: host1IPv6Addr.AddressWithPrefix.Address, + clientReadableCH: ep2WECH, + } + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + stackOpts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + } + + host1Stack := stack.New(stackOpts) + routerStack := stack.New(stackOpts) + host2Stack := stack.New(stackOpts) + + host1NIC, routerNIC1 := pipe.New(host1NICLinkAddr, routerNIC1LinkAddr) + routerNIC2, host2NIC := pipe.New(routerNIC2LinkAddr, host2NICLinkAddr) + + if err := host1Stack.CreateNIC(host1NICID, ethernet.New(host1NIC)); err != nil { + t.Fatalf("host1Stack.CreateNIC(%d, _): %s", host1NICID, err) + } + if err := routerStack.CreateNIC(routerNICID1, ethernet.New(routerNIC1)); err != nil { + t.Fatalf("routerStack.CreateNIC(%d, _): %s", routerNICID1, err) + } + if err := routerStack.CreateNIC(routerNICID2, ethernet.New(routerNIC2)); err != nil { + t.Fatalf("routerStack.CreateNIC(%d, _): %s", routerNICID2, err) + } + if err := host2Stack.CreateNIC(host2NICID, ethernet.New(host2NIC)); err != nil { + t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err) + } + + if err := routerStack.SetForwarding(ipv4.ProtocolNumber, true); err != nil { + t.Fatalf("routerStack.SetForwarding(%d): %s", ipv4.ProtocolNumber, err) + } + if err := routerStack.SetForwarding(ipv6.ProtocolNumber, true); err != nil { + t.Fatalf("routerStack.SetForwarding(%d): %s", ipv6.ProtocolNumber, err) + } + + if err := host1Stack.AddAddress(host1NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("host1Stack.AddAddress(%d, %d, %s): %s", host1NICID, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + if err := routerStack.AddAddress(routerNICID1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("routerStack.AddAddress(%d, %d, %s): %s", routerNICID1, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + if err := routerStack.AddAddress(routerNICID2, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("routerStack.AddAddress(%d, %d, %s): %s", routerNICID2, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + if err := host2Stack.AddAddress(host2NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("host2Stack.AddAddress(%d, %d, %s): %s", host2NICID, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + + if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv4Addr); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv4Addr, 
err) + } + if err := routerStack.AddProtocolAddress(routerNICID1, routerNIC1IPv4Addr); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID1, routerNIC1IPv4Addr, err) + } + if err := routerStack.AddProtocolAddress(routerNICID2, routerNIC2IPv4Addr); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID2, routerNIC2IPv4Addr, err) + } + if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv4Addr); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv4Addr, err) + } + if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv6Addr); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv6Addr, err) + } + if err := routerStack.AddProtocolAddress(routerNICID1, routerNIC1IPv6Addr); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID1, routerNIC1IPv6Addr, err) + } + if err := routerStack.AddProtocolAddress(routerNICID2, routerNIC2IPv6Addr); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID2, routerNIC2IPv6Addr, err) + } + if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv6Addr); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv6Addr, err) + } + + host1Stack.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: host1IPv4Addr.AddressWithPrefix.Subnet(), + NIC: host1NICID, + }, + tcpip.Route{ + Destination: host1IPv6Addr.AddressWithPrefix.Subnet(), + NIC: host1NICID, + }, + tcpip.Route{ + Destination: host2IPv4Addr.AddressWithPrefix.Subnet(), + Gateway: routerNIC1IPv4Addr.AddressWithPrefix.Address, + NIC: host1NICID, + }, + tcpip.Route{ + Destination: host2IPv6Addr.AddressWithPrefix.Subnet(), + Gateway: routerNIC1IPv6Addr.AddressWithPrefix.Address, + NIC: host1NICID, + }, + }) + routerStack.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: routerNIC1IPv4Addr.AddressWithPrefix.Subnet(), + NIC: routerNICID1, + }, + tcpip.Route{ + Destination: routerNIC1IPv6Addr.AddressWithPrefix.Subnet(), + NIC: routerNICID1, + }, + tcpip.Route{ + Destination: routerNIC2IPv4Addr.AddressWithPrefix.Subnet(), + NIC: routerNICID2, + }, + tcpip.Route{ + Destination: routerNIC2IPv6Addr.AddressWithPrefix.Subnet(), + NIC: routerNICID2, + }, + }) + host2Stack.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: host2IPv4Addr.AddressWithPrefix.Subnet(), + NIC: host2NICID, + }, + tcpip.Route{ + Destination: host2IPv6Addr.AddressWithPrefix.Subnet(), + NIC: host2NICID, + }, + tcpip.Route{ + Destination: host1IPv4Addr.AddressWithPrefix.Subnet(), + Gateway: routerNIC2IPv4Addr.AddressWithPrefix.Address, + NIC: host2NICID, + }, + tcpip.Route{ + Destination: host1IPv6Addr.AddressWithPrefix.Subnet(), + Gateway: routerNIC2IPv6Addr.AddressWithPrefix.Address, + NIC: host2NICID, + }, + }) + + epsAndAddrs := test.epAndAddrs(t, host1Stack, routerStack, host2Stack) + defer epsAndAddrs.serverEP.Close() + defer epsAndAddrs.clientEP.Close() + + serverAddr := tcpip.FullAddress{Addr: epsAndAddrs.serverAddr, Port: listenPort} + if err := epsAndAddrs.serverEP.Bind(serverAddr); err != nil { + t.Fatalf("epsAndAddrs.serverEP.Bind(%#v): %s", serverAddr, err) + } + clientAddr := tcpip.FullAddress{Addr: epsAndAddrs.clientAddr} + if err := epsAndAddrs.clientEP.Bind(clientAddr); err != nil { + t.Fatalf("epsAndAddrs.clientEP.Bind(%#v): %s", clientAddr, err) + } + + write := func(ep tcpip.Endpoint, data []byte, to *tcpip.FullAddress) { + t.Helper() + + dataPayload := tcpip.SlicePayload(data) 
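				// Editorial aside (not part of the patch): a first Write to a
				// destination without a cached link-layer address fails with
				// tcpip.ErrNoLinkAddress and returns a channel that is closed
				// once ARP/NDP resolution completes; the write is then retried,
				// as below. TestPing in link_resolution_test.go exercises the
				// same pattern explicitly.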
+ wOpts := tcpip.WriteOptions{To: to} + n, ch, err := ep.Write(dataPayload, wOpts) + if err == tcpip.ErrNoLinkAddress { + // Wait for link resolution to complete. + <-ch + + n, _, err = ep.Write(dataPayload, wOpts) + } else if err != nil { + t.Fatalf("ep.Write(_, _): %s", err) + } + + if err != nil { + t.Fatalf("ep.Write(_, _): %s", err) + } + if want := int64(len(data)); n != want { + t.Fatalf("got ep.Write(_, _) = (%d, _, _), want = (%d, _, _)", n, want) + } + } + + data := []byte{1, 2, 3, 4} + write(epsAndAddrs.clientEP, data, &serverAddr) + + read := func(ch chan struct{}, ep tcpip.Endpoint, data []byte, expectedFrom tcpip.Address) tcpip.FullAddress { + t.Helper() + + // Wait for the endpoint to be readable. + <-ch + + var addr tcpip.FullAddress + v, _, err := ep.Read(&addr) + if err != nil { + t.Fatalf("ep.Read(_): %s", err) + } + + if diff := cmp.Diff(v, buffer.View(data)); diff != "" { + t.Errorf("received data mismatch (-want +got):\n%s", diff) + } + if addr.Addr != expectedFrom { + t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, expectedFrom) + } + + if t.Failed() { + t.FailNow() + } + + return addr + } + + addr := read(epsAndAddrs.serverReadableCH, epsAndAddrs.serverEP, data, epsAndAddrs.clientAddr) + // Unspecify the NIC since NIC IDs are meaningless across stacks. + addr.NIC = 0 + + data = tcpip.SlicePayload([]byte{5, 6, 7, 8, 9, 10, 11, 12}) + write(epsAndAddrs.serverEP, data, &addr) + addr = read(epsAndAddrs.clientReadableCH, epsAndAddrs.clientEP, data, epsAndAddrs.serverAddr) + if addr.Port != listenPort { + t.Errorf("got addr.Port = %d, want = %d", addr.Port, listenPort) + } + }) + } +} diff --git a/pkg/tcpip/tests/integration/link_resolution_test.go b/pkg/tcpip/tests/integration/link_resolution_test.go new file mode 100644 index 000000000..6ddcda70c --- /dev/null +++ b/pkg/tcpip/tests/integration/link_resolution_test.go @@ -0,0 +1,220 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package integration_test
+
+import (
+	"net"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
+	"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var (
+	host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06")
+	host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09")
+
+	host1IPv4Addr = tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.0.1").To4()),
+			PrefixLen: 24,
+		},
+	}
+	host2IPv4Addr = tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.0.2").To4()),
+			PrefixLen: 8,
+		},
+	}
+	host1IPv6Addr = tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("a::1").To16()),
+			PrefixLen: 64,
+		},
+	}
+	host2IPv6Addr = tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("a::2").To16()),
+			PrefixLen: 64,
+		},
+	}
+)
+
+// TestPing tests that two hosts can ping each other when link resolution is
+// enabled.
+func TestPing(t *testing.T) {
+	const (
+		host1NICID = 1
+		host2NICID = 4
+
+		// icmpDataOffset is the offset to the data in both ICMPv4 and ICMPv6 echo
+		// request/reply packets.
+ icmpDataOffset = 8 + ) + + tests := []struct { + name string + transProto tcpip.TransportProtocolNumber + netProto tcpip.NetworkProtocolNumber + remoteAddr tcpip.Address + icmpBuf func(*testing.T) buffer.View + }{ + { + name: "IPv4 Ping", + transProto: icmp.ProtocolNumber4, + netProto: ipv4.ProtocolNumber, + remoteAddr: host2IPv4Addr.AddressWithPrefix.Address, + icmpBuf: func(t *testing.T) buffer.View { + data := [8]byte{1, 2, 3, 4, 5, 6, 7, 8} + hdr := header.ICMPv4(make([]byte, header.ICMPv4MinimumSize+len(data))) + hdr.SetType(header.ICMPv4Echo) + if n := copy(hdr.Payload(), data[:]); n != len(data) { + t.Fatalf("copied %d bytes but expected to copy %d bytes", n, len(data)) + } + return buffer.View(hdr) + }, + }, + { + name: "IPv6 Ping", + transProto: icmp.ProtocolNumber6, + netProto: ipv6.ProtocolNumber, + remoteAddr: host2IPv6Addr.AddressWithPrefix.Address, + icmpBuf: func(t *testing.T) buffer.View { + data := [8]byte{1, 2, 3, 4, 5, 6, 7, 8} + hdr := header.ICMPv6(make([]byte, header.ICMPv6MinimumSize+len(data))) + hdr.SetType(header.ICMPv6EchoRequest) + if n := copy(hdr.Payload(), data[:]); n != len(data) { + t.Fatalf("copied %d bytes but expected to copy %d bytes", n, len(data)) + } + return buffer.View(hdr) + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + stackOpts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4, icmp.NewProtocol6}, + } + + host1Stack := stack.New(stackOpts) + host2Stack := stack.New(stackOpts) + + host1NIC, host2NIC := pipe.New(host1NICLinkAddr, host2NICLinkAddr) + + if err := host1Stack.CreateNIC(host1NICID, ethernet.New(host1NIC)); err != nil { + t.Fatalf("host1Stack.CreateNIC(%d, _): %s", host1NICID, err) + } + if err := host2Stack.CreateNIC(host2NICID, ethernet.New(host2NIC)); err != nil { + t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err) + } + + if err := host1Stack.AddAddress(host1NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("host1Stack.AddAddress(%d, %d, %s): %s", host1NICID, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + if err := host2Stack.AddAddress(host2NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("host2Stack.AddAddress(%d, %d, %s): %s", host2NICID, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + + if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv4Addr); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv4Addr, err) + } + if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv4Addr); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv4Addr, err) + } + if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv6Addr); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv6Addr, err) + } + if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv6Addr); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv6Addr, err) + } + + host1Stack.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: host1IPv4Addr.AddressWithPrefix.Subnet(), + NIC: host1NICID, + }, + tcpip.Route{ + Destination: host1IPv6Addr.AddressWithPrefix.Subnet(), + NIC: host1NICID, + }, + }) + host2Stack.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: host2IPv4Addr.AddressWithPrefix.Subnet(), + NIC: host2NICID, + }, + tcpip.Route{ + Destination: 
host2IPv6Addr.AddressWithPrefix.Subnet(), + NIC: host2NICID, + }, + }) + + var wq waiter.Queue + we, waiterCH := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + ep, err := host1Stack.NewEndpoint(test.transProto, test.netProto, &wq) + if err != nil { + t.Fatalf("host1Stack.NewEndpoint(%d, %d, _): %s", test.transProto, test.netProto, err) + } + defer ep.Close() + + // The first write should trigger link resolution. + icmpBuf := test.icmpBuf(t) + wOpts := tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: test.remoteAddr}} + if _, ch, err := ep.Write(tcpip.SlicePayload(icmpBuf), wOpts); err != tcpip.ErrNoLinkAddress { + t.Fatalf("got ep.Write(_, _) = %s, want = %s", err, tcpip.ErrNoLinkAddress) + } else { + // Wait for link resolution to complete. + <-ch + } + if n, _, err := ep.Write(tcpip.SlicePayload(icmpBuf), wOpts); err != nil { + t.Fatalf("ep.Write(_, _): %s", err) + } else if want := int64(len(icmpBuf)); n != want { + t.Fatalf("got ep.Write(_, _) = (%d, _, _), want = (%d, _, _)", n, want) + } + + // Wait for the endpoint to be readable. + <-waiterCH + + var addr tcpip.FullAddress + v, _, err := ep.Read(&addr) + if err != nil { + t.Fatalf("ep.Read(_): %s", err) + } + if diff := cmp.Diff(v[icmpDataOffset:], icmpBuf[icmpDataOffset:]); diff != "" { + t.Errorf("received data mismatch (-want +got):\n%s", diff) + } + if addr.Addr != test.remoteAddr { + t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, test.remoteAddr) + } + }) + } +} diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go new file mode 100644 index 000000000..e8caf09ba --- /dev/null +++ b/pkg/tcpip/tests/integration/loopback_test.go @@ -0,0 +1,314 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package integration_test + +import ( + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" +) + +var _ ipv6.NDPDispatcher = (*ndpDispatcher)(nil) + +type ndpDispatcher struct{} + +func (*ndpDispatcher) OnDuplicateAddressDetectionStatus(tcpip.NICID, tcpip.Address, bool, *tcpip.Error) { +} + +func (*ndpDispatcher) OnDefaultRouterDiscovered(tcpip.NICID, tcpip.Address) bool { + return false +} + +func (*ndpDispatcher) OnDefaultRouterInvalidated(tcpip.NICID, tcpip.Address) {} + +func (*ndpDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool { + return false +} + +func (*ndpDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) {} + +func (*ndpDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool { + return true +} + +func (*ndpDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) {} + +func (*ndpDispatcher) OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) {} + +func (*ndpDispatcher) OnRecursiveDNSServerOption(tcpip.NICID, []tcpip.Address, time.Duration) {} + +func (*ndpDispatcher) OnDNSSearchListOption(tcpip.NICID, []string, time.Duration) {} + +func (*ndpDispatcher) OnDHCPv6Configuration(tcpip.NICID, ipv6.DHCPv6ConfigurationFromNDPRA) {} + +// TestInitialLoopbackAddresses tests that the loopback interface does not +// auto-generate a link-local address when it is brought up. +func TestInitialLoopbackAddresses(t *testing.T) { + const nicID = 1 + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPDisp: &ndpDispatcher{}, + AutoGenIPv6LinkLocal: true, + OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ + NICNameFromID: func(nicID tcpip.NICID, nicName string) string { + t.Fatalf("should not attempt to get name for NIC with ID = %d; nicName = %s", nicID, nicName) + return "" + }, + }, + })}, + }) + + if err := s.CreateNIC(nicID, loopback.New()); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + + nicsInfo := s.NICInfo() + if nicInfo, ok := nicsInfo[nicID]; !ok { + t.Fatalf("did not find NIC with ID = %d in s.NICInfo() = %#v", nicID, nicsInfo) + } else if got := len(nicInfo.ProtocolAddresses); got != 0 { + t.Fatalf("got len(nicInfo.ProtocolAddresses) = %d, want = 0; nicInfo.ProtocolAddresses = %#v", got, nicInfo.ProtocolAddresses) + } +} + +// TestLoopbackAcceptAllInSubnet tests that a loopback interface considers +// itself bound to all addresses in the subnet of an assigned address. 
+func TestLoopbackAcceptAllInSubnet(t *testing.T) { + const ( + nicID = 1 + localPort = 80 + ) + + data := []byte{1, 2, 3, 4} + + ipv4ProtocolAddress := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + } + ipv4Bytes := []byte(ipv4Addr.Address) + ipv4Bytes[len(ipv4Bytes)-1]++ + otherIPv4Address := tcpip.Address(ipv4Bytes) + + ipv6ProtocolAddress := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: ipv6Addr, + } + ipv6Bytes := []byte(ipv6Addr.Address) + ipv6Bytes[len(ipv6Bytes)-1]++ + otherIPv6Address := tcpip.Address(ipv6Bytes) + + tests := []struct { + name string + addAddress tcpip.ProtocolAddress + bindAddr tcpip.Address + dstAddr tcpip.Address + expectRx bool + }{ + { + name: "IPv4 bind to wildcard and send to assigned address", + addAddress: ipv4ProtocolAddress, + dstAddr: ipv4Addr.Address, + expectRx: true, + }, + { + name: "IPv4 bind to wildcard and send to other subnet-local address", + addAddress: ipv4ProtocolAddress, + dstAddr: otherIPv4Address, + expectRx: true, + }, + { + name: "IPv4 bind to wildcard send to other address", + addAddress: ipv4ProtocolAddress, + dstAddr: remoteIPv4Addr, + expectRx: false, + }, + { + name: "IPv4 bind to other subnet-local address and send to assigned address", + addAddress: ipv4ProtocolAddress, + bindAddr: otherIPv4Address, + dstAddr: ipv4Addr.Address, + expectRx: false, + }, + { + name: "IPv4 bind and send to other subnet-local address", + addAddress: ipv4ProtocolAddress, + bindAddr: otherIPv4Address, + dstAddr: otherIPv4Address, + expectRx: true, + }, + { + name: "IPv4 bind to assigned address and send to other subnet-local address", + addAddress: ipv4ProtocolAddress, + bindAddr: ipv4Addr.Address, + dstAddr: otherIPv4Address, + expectRx: false, + }, + + { + name: "IPv6 bind and send to assigned address", + addAddress: ipv6ProtocolAddress, + bindAddr: ipv6Addr.Address, + dstAddr: ipv6Addr.Address, + expectRx: true, + }, + { + name: "IPv6 bind to wildcard and send to other subnet-local address", + addAddress: ipv6ProtocolAddress, + dstAddr: otherIPv6Address, + expectRx: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + }) + if err := s.CreateNIC(nicID, loopback.New()); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, test.addAddress, err) + } + s.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: header.IPv4EmptySubnet, + NIC: nicID, + }, + tcpip.Route{ + Destination: header.IPv6EmptySubnet, + NIC: nicID, + }, + }) + + wq := waiter.Queue{} + rep, err := s.NewEndpoint(udp.ProtocolNumber, test.addAddress.Protocol, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err) + } + defer rep.Close() + + bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort} + if err := rep.Bind(bindAddr); err != nil { + t.Fatalf("rep.Bind(%+v): %s", bindAddr, err) + } + + sep, err := s.NewEndpoint(udp.ProtocolNumber, test.addAddress.Protocol, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err) + } + defer sep.Close() + + wopts := tcpip.WriteOptions{ + To: &tcpip.FullAddress{ + Addr: 
test.dstAddr,
+					Port: localPort,
+				},
+			}
+			n, _, err := sep.Write(tcpip.SlicePayload(data), wopts)
+			if err != nil {
+				t.Fatalf("sep.Write(_, _): %s", err)
+			}
+			if want := int64(len(data)); n != want {
+				t.Fatalf("got sep.Write(_, _) = (%d, _, nil), want = (%d, _, nil)", n, want)
+			}
+
+			if gotPayload, _, err := rep.Read(nil); test.expectRx {
+				if err != nil {
+					t.Fatalf("rep.Read(nil): %s", err)
+				}
+				if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" {
+					t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Fatalf("got rep.Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+				}
+			}
+		})
+	}
+}
+
+// TestLoopbackSubnetLifetimeBoundToAddr tests that the lifetime of an address
+// in a loopback interface's associated subnet is bound to the lifetime of the
+// permanently bound address.
+func TestLoopbackSubnetLifetimeBoundToAddr(t *testing.T) {
+	const nicID = 1
+
+	protoAddr := tcpip.ProtocolAddress{
+		Protocol:          ipv4.ProtocolNumber,
+		AddressWithPrefix: ipv4Addr,
+	}
+	addrBytes := []byte(ipv4Addr.Address)
+	addrBytes[len(addrBytes)-1]++
+	otherAddr := tcpip.Address(addrBytes)
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+	})
+	if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+		t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+	}
+	if err := s.AddProtocolAddress(nicID, protoAddr); err != nil {
+		t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, protoAddr, err)
+	}
+	s.SetRouteTable([]tcpip.Route{
+		tcpip.Route{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         nicID,
+		},
+	})
+
+	r, err := s.FindRoute(nicID, otherAddr, remoteIPv4Addr, ipv4.ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, otherAddr, remoteIPv4Addr, ipv4.ProtocolNumber, err)
+	}
+	defer r.Release()
+
+	params := stack.NetworkHeaderParams{
+		Protocol: 111,
+		TTL:      64,
+		TOS:      stack.DefaultTOS,
+	}
+	data := buffer.View([]byte{1, 2, 3, 4})
+	if err := r.WritePacket(nil /* gso */, params, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: int(r.MaxHeaderLength()),
+		Data:               data.ToVectorisedView(),
+	})); err != nil {
+		t.Fatalf("r.WritePacket(nil, %#v, _): %s", params, err)
+	}
+
+	// Removing the address should make the endpoint invalid.
+ if err := s.RemoveAddress(nicID, protoAddr.AddressWithPrefix.Address); err != nil { + t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, protoAddr.AddressWithPrefix.Address, err) + } + if err := r.WritePacket(nil /* gso */, params, stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()), + Data: data.ToVectorisedView(), + })); err != tcpip.ErrInvalidEndpointState { + t.Fatalf("got r.WritePacket(nil, %#v, _) = %s, want = %s", params, err, tcpip.ErrInvalidEndpointState) + } +} diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go index 9f0dd4d6d..f1028823b 100644 --- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go +++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -79,6 +80,7 @@ func TestPingMulticastBroadcast(t *testing.T) { SrcAddr: remoteIPv4Addr, DstAddr: dst, }) + ip.SetChecksum(^ip.CalculateChecksum()) e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -139,11 +141,9 @@ func TestPingMulticastBroadcast(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - ipv4Proto := ipv4.NewProtocol() - ipv6Proto := ipv6.NewProtocol() s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4Proto, ipv6Proto}, - TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4(), icmp.NewProtocol6()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4, icmp.NewProtocol6}, }) // We only expect a single packet in response to our ICMP Echo Request. 
e := channel.New(1, defaultMTU, "") @@ -175,18 +175,18 @@ func TestPingMulticastBroadcast(t *testing.T) { var rxICMP func(*channel.Endpoint, tcpip.Address) var expectedSrc tcpip.Address var expectedDst tcpip.Address - var proto stack.NetworkProtocol + var protoNum tcpip.NetworkProtocolNumber switch l := len(test.dstAddr); l { case header.IPv4AddressSize: rxICMP = rxIPv4ICMP expectedSrc = ipv4Addr.Address expectedDst = remoteIPv4Addr - proto = ipv4Proto + protoNum = header.IPv4ProtocolNumber case header.IPv6AddressSize: rxICMP = rxIPv6ICMP expectedSrc = ipv6Addr.Address expectedDst = remoteIPv6Addr - proto = ipv6Proto + protoNum = header.IPv6ProtocolNumber default: t.Fatalf("got unexpected address length = %d bytes", l) } @@ -204,7 +204,7 @@ func TestPingMulticastBroadcast(t *testing.T) { t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, expectedDst) } - src, dst := proto.ParseAddresses(pkt.Pkt.NetworkHeader().View()) + src, dst := s.NetworkProtocolInstance(protoNum).ParseAddresses(stack.PayloadSince(pkt.Pkt.NetworkHeader())) if src != expectedSrc { t.Errorf("got pkt source = %s, want = %s", src, expectedSrc) } @@ -251,6 +251,7 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) { SrcAddr: remoteIPv4Addr, DstAddr: dst, }) + ip.SetChecksum(^ip.CalculateChecksum()) e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -379,8 +380,8 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) e := channel.New(0, defaultMTU, "") if err := s.CreateNIC(nicID, e); err != nil { @@ -430,7 +431,126 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) { } } else { if err != tcpip.ErrWouldBlock { - t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock) + t.Fatalf("got Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock) + } + } + }) + } +} + +// TestReuseAddrAndBroadcast makes sure broadcast packets are received by all +// interested endpoints. 
+func TestReuseAddrAndBroadcast(t *testing.T) { + const ( + nicID = 1 + localPort = 9000 + loopbackBroadcast = tcpip.Address("\x7f\xff\xff\xff") + ) + + data := tcpip.SlicePayload([]byte{1, 2, 3, 4}) + + tests := []struct { + name string + broadcastAddr tcpip.Address + }{ + { + name: "Subnet directed broadcast", + broadcastAddr: loopbackBroadcast, + }, + { + name: "IPv4 broadcast", + broadcastAddr: header.IPv4Broadcast, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + }) + if err := s.CreateNIC(nicID, loopback.New()); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + protoAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x7f\x00\x00\x01", + PrefixLen: 8, + }, + } + if err := s.AddProtocolAddress(nicID, protoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, protoAddr, err) + } + + s.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + // We use the empty subnet instead of just the loopback subnet so we + // also have a route to the IPv4 Broadcast address. + Destination: header.IPv4EmptySubnet, + NIC: nicID, + }, + }) + + // We create endpoints that bind to both the wildcard address and the + // broadcast address to make sure both of these types of "broadcast + // interested" endpoints receive broadcast packets. + wq := waiter.Queue{} + var eps []tcpip.Endpoint + for _, bindWildcard := range []bool{false, true} { + // Create multiple endpoints for each type of "broadcast interested" + // endpoint so we can test that all endpoints receive the broadcast + // packet. 
+ for i := 0; i < 2; i++ { + ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq) + if err != nil { + t.Fatalf("(eps[%d]) NewEndpoint(%d, %d, _): %s", len(eps), udp.ProtocolNumber, ipv4.ProtocolNumber, err) + } + defer ep.Close() + + if err := ep.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("eps[%d].SetSockOptBool(tcpip.ReuseAddressOption, true): %s", len(eps), err) + } + + if err := ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil { + t.Fatalf("eps[%d].SetSockOptBool(tcpip.BroadcastOption, true): %s", len(eps), err) + } + + bindAddr := tcpip.FullAddress{Port: localPort} + if bindWildcard { + if err := ep.Bind(bindAddr); err != nil { + t.Fatalf("eps[%d].Bind(%+v): %s", len(eps), bindAddr, err) + } + } else { + bindAddr.Addr = test.broadcastAddr + if err := ep.Bind(bindAddr); err != nil { + t.Fatalf("eps[%d].Bind(%+v): %s", len(eps), bindAddr, err) + } + } + + eps = append(eps, ep) + } + } + + for i, wep := range eps { + writeOpts := tcpip.WriteOptions{ + To: &tcpip.FullAddress{ + Addr: test.broadcastAddr, + Port: localPort, + }, + } + if n, _, err := wep.Write(data, writeOpts); err != nil { + t.Fatalf("eps[%d].Write(_, _): %s", i, err) + } else if want := int64(len(data)); n != want { + t.Fatalf("got eps[%d].Write(_, _) = (%d, nil, nil), want = (%d, nil, nil)", i, n, want) + } + + for j, rep := range eps { + if gotPayload, _, err := rep.Read(nil); err != nil { + t.Errorf("(eps[%d] write) eps[%d].Read(nil): %s", i, j, err) + } else if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" { + t.Errorf("(eps[%d] write) got UDP payload from eps[%d] mismatch (-want +got):\n%s", i, j, diff) + } } } }) diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index f32d58091..606363567 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.9 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index bd6f49eb8..41eb0ca44 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -74,6 +74,8 @@ type endpoint struct { route stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -343,10 +345,15 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { } // SetSockOpt sets a socket option. -func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - switch opt.(type) { - case tcpip.SocketDetachFilterOption: +func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { + switch v := opt.(type) { + case *tcpip.SocketDetachFilterOption: return nil + + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() } return nil } @@ -415,9 +422,12 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch opt.(type) { - case tcpip.ErrorOption: +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.mu.Lock() + *o = e.linger + e.mu.Unlock() return nil default: @@ -436,6 +446,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpi pkt.Owner = owner icmpv4 := header.ICMPv4(pkt.TransportHeader().Push(header.ICMPv4MinimumSize)) + pkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber copy(icmpv4, data) // Set the ident to the user-specified port. Sequence number should // already be set by the user. @@ -468,6 +479,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err }) icmpv6 := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6MinimumSize)) + pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber copy(icmpv6, data) // Set the ident. Sequence number is provided by the user. icmpv6.SetIdent(ident) @@ -603,7 +615,7 @@ func (*endpoint) Listen(int) *tcpip.Error { } // Accept is not supported by UDP, it just fails. -func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { +func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { return nil, nil, tcpip.ErrNotSupported } @@ -836,3 +848,8 @@ func (e *endpoint) Stats() tcpip.EndpointStats { // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() {} + +// LastError implements tcpip.Endpoint.LastError. +func (*endpoint) LastError() *tcpip.Error { + return nil +} diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 74ef6541e..87d510f96 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -13,12 +13,7 @@ // limitations under the License. // Package icmp contains the implementation of the ICMP and IPv6-ICMP transport -// protocols for use in ping. To use it in the networking stack, this package -// must be added to the project, and activated on the stack by passing -// icmp.NewProtocol4() and/or icmp.NewProtocol6() as one of the transport -// protocols when calling stack.New(). Then endpoints can be created by passing -// icmp.ProtocolNumber or icmp.ProtocolNumber6 as the transport protocol number -// when calling Stack.NewEndpoint(). +// protocols for use in ping. package icmp import ( @@ -42,6 +37,8 @@ const ( // protocol implements stack.TransportProtocol. type protocol struct { + stack *stack.Stack + number tcpip.TransportProtocolNumber } @@ -62,20 +59,20 @@ func (p *protocol) netProto() tcpip.NetworkProtocolNumber { // NewEndpoint creates a new icmp endpoint. It implements // stack.TransportProtocol.NewEndpoint. -func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { +func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { if netProto != p.netProto() { return nil, tcpip.ErrUnknownProtocol } - return newEndpoint(stack, netProto, p.number, waiterQueue) + return newEndpoint(p.stack, netProto, p.number, waiterQueue) } // NewRawEndpoint creates a new raw icmp endpoint. It implements // stack.TransportProtocol.NewRawEndpoint. 
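Because the protocol struct now carries its *stack.Stack, the constructors at the bottom of this file take the stack as an argument and can be passed directly as factories to stack.New, mirroring the udp.NewProtocol usage in the loopback test earlier in this commit. A hedged sketch of the wiring (wq is a waiter.Queue; icmp.ProtocolNumber4 is this package's ICMPv4 transport number):

	s := stack.New(stack.Options{
		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
		TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4},
	})
	ep, err := s.NewEndpoint(icmp.ProtocolNumber4, ipv4.ProtocolNumber, &wq)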
-func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { +func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { if netProto != p.netProto() { return nil, tcpip.ErrUnknownProtocol } - return raw.NewEndpoint(stack, netProto, p.number, waiterQueue) + return raw.NewEndpoint(p.stack, netProto, p.number, waiterQueue) } // MinimumPacketSize returns the minimum valid icmp packet size. @@ -104,17 +101,17 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. -func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool { - return true +func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { + return stack.UnknownDestinationPacketHandled } // SetOption implements stack.TransportProtocol.SetOption. -func (*protocol) SetOption(option interface{}) *tcpip.Error { +func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } // Option implements stack.TransportProtocol.Option. -func (*protocol) Option(option interface{}) *tcpip.Error { +func (*protocol) Option(tcpip.GettableTransportProtocolOption) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } @@ -135,11 +132,11 @@ func (*protocol) Parse(pkt *stack.PacketBuffer) bool { } // NewProtocol4 returns an ICMPv4 transport protocol. -func NewProtocol4() stack.TransportProtocol { - return &protocol{ProtocolNumber4} +func NewProtocol4(s *stack.Stack) stack.TransportProtocol { + return &protocol{stack: s, number: ProtocolNumber4} } // NewProtocol6 returns an ICMPv6 transport protocol. -func NewProtocol6() stack.TransportProtocol { - return &protocol{ProtocolNumber6} +func NewProtocol6(s *stack.Stack) stack.TransportProtocol { + return &protocol{stack: s, number: ProtocolNumber6} } diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 1b03ad6bb..072601d2d 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -83,6 +83,8 @@ type endpoint struct { stats tcpip.TransportEndpointStats `state:"nosave"` bound bool boundNIC tcpip.NICID + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` @@ -192,13 +194,13 @@ func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMes return ep.ReadPacket(addr, nil) } -func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { +func (*endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { // TODO(gvisor.dev/issue/173): Implement. return 0, nil, tcpip.ErrInvalidOptionValue } // Peek implements tcpip.Endpoint.Peek. -func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { +func (*endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { return 0, tcpip.ControlMessages{}, nil } @@ -210,25 +212,25 @@ func (*endpoint) Disconnect() *tcpip.Error { // Connect implements tcpip.Endpoint.Connect. 
Packet sockets cannot be
// connected, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+func (*endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used
 // with Shutdown, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+func (*endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with
 // Listen, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Listen(backlog int) *tcpip.Error {
+func (*endpoint) Listen(backlog int) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with
 // Accept, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -267,12 +269,12 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
-func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	return tcpip.FullAddress{}, tcpip.ErrNotSupported
 }
 
 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
-func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	// Even a connected socket doesn't return a remote address.
 	return tcpip.FullAddress{}, tcpip.ErrNotConnected
 }
 
@@ -297,9 +299,15 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be
 // used with SetSockOpt, and this function always returns
 // tcpip.ErrNotSupported.
-func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+	switch v := opt.(type) {
+	case *tcpip.SocketDetachFilterOption:
+		return nil
+
+	case *tcpip.LingerOption:
+		ep.mu.Lock()
+		ep.linger = *v
+		ep.mu.Unlock()
 		return nil
 
 	default:
@@ -356,7 +364,7 @@ func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	}
 }
 
-func (ep *endpoint) takeLastError() *tcpip.Error {
+func (ep *endpoint) LastError() *tcpip.Error {
 	ep.lastErrorMu.Lock()
 	defer ep.lastErrorMu.Unlock()
 
@@ -366,16 +374,21 @@ func (ep *endpoint) takeLastError() *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return ep.takeLastError()
+func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+	switch o := opt.(type) {
+	case *tcpip.LingerOption:
+		ep.mu.Lock()
+		*o = ep.linger
+		ep.mu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrNotSupported
 	}
-	return tcpip.ErrNotSupported
 }
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	return false, tcpip.ErrNotSupported
 }
 
@@ -512,7 +525,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress,
 }
 
 // State implements socket.Socket.State.
-func (ep *endpoint) State() uint32 {
+func (*endpoint) State() uint32 {
 	return 0
 }
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index edc2b5b61..e37c00523 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -84,6 +84,8 @@ type endpoint struct {
 	// Connect(), and is valid only when connected is true.
 	route stack.Route `state:"manual"`
 	stats tcpip.TransportEndpointStats `state:"nosave"`
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
@@ -446,12 +448,12 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 }
 
 // Listen implements tcpip.Endpoint.Listen.
-func (e *endpoint) Listen(backlog int) *tcpip.Error {
+func (*endpoint) Listen(backlog int) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Accept implements tcpip.Endpoint.Accept.
-func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -482,12 +484,12 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
-func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	return tcpip.FullAddress{}, tcpip.ErrNotSupported
 }
 
 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
-func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	// Even a connected socket doesn't return a remote address.
 	return tcpip.FullAddress{}, tcpip.ErrNotConnected
 }
 
@@ -510,9 +512,15 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+	switch v := opt.(type) {
+	case *tcpip.SocketDetachFilterOption:
+		return nil
+
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		e.linger = *v
+		e.mu.Unlock()
 		return nil
 
 	default:
@@ -577,9 +585,12 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+	switch o := opt.(type) {
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		*o = e.linger
+		e.mu.Unlock()
 		return nil
 
 	default:
@@ -739,3 +750,7 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 
 // Wait implements stack.TransportEndpoint.Wait.
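The raw and packet endpoints also gain a LastError method (added just below for the raw endpoint), replacing the old idiom of fetching the pending socket error through GetSockOpt(tcpip.ErrorOption{}). A small before/after sketch (illustrative; ep is any tcpip.Endpoint from this commit):

	// Before: err := ep.GetSockOpt(tcpip.ErrorOption{})
	// After: the pending error is exposed directly.
	if err := ep.LastError(); err != nil {
		// e.g. an asynchronous connect() failure surfaces here
	}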
func (*endpoint) Wait() {} + +func (*endpoint) LastError() *tcpip.Error { + return nil +} diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go index 33bfb56cd..7d97cbdc7 100644 --- a/pkg/tcpip/transport/raw/endpoint_state.go +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -37,57 +37,57 @@ func (p *rawPacket) loadData(data buffer.VectorisedView) { } // beforeSave is invoked by stateify. -func (ep *endpoint) beforeSave() { +func (e *endpoint) beforeSave() { // Stop incoming packets from being handled (and mutate endpoint state). // The lock will be released after saveRcvBufSizeMax(), which would have - // saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming + // saved e.rcvBufSizeMax and set it to 0 to continue blocking incoming // packets. - ep.rcvMu.Lock() + e.rcvMu.Lock() } // saveRcvBufSizeMax is invoked by stateify. -func (ep *endpoint) saveRcvBufSizeMax() int { - max := ep.rcvBufSizeMax +func (e *endpoint) saveRcvBufSizeMax() int { + max := e.rcvBufSizeMax // Make sure no new packets will be handled regardless of the lock. - ep.rcvBufSizeMax = 0 + e.rcvBufSizeMax = 0 // Release the lock acquired in beforeSave() so regular endpoint closing // logic can proceed after save. - ep.rcvMu.Unlock() + e.rcvMu.Unlock() return max } // loadRcvBufSizeMax is invoked by stateify. -func (ep *endpoint) loadRcvBufSizeMax(max int) { - ep.rcvBufSizeMax = max +func (e *endpoint) loadRcvBufSizeMax(max int) { + e.rcvBufSizeMax = max } // afterLoad is invoked by stateify. -func (ep *endpoint) afterLoad() { - stack.StackFromEnv.RegisterRestoredEndpoint(ep) +func (e *endpoint) afterLoad() { + stack.StackFromEnv.RegisterRestoredEndpoint(e) } // Resume implements tcpip.ResumableEndpoint.Resume. -func (ep *endpoint) Resume(s *stack.Stack) { - ep.stack = s +func (e *endpoint) Resume(s *stack.Stack) { + e.stack = s // If the endpoint is connected, re-connect. - if ep.connected { + if e.connected { var err *tcpip.Error - ep.route, err = ep.stack.FindRoute(ep.RegisterNICID, ep.BindAddr, ep.route.RemoteAddress, ep.NetProto, false) + e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.route.RemoteAddress, e.NetProto, false) if err != nil { panic(err) } } // If the endpoint is bound, re-bind. 
-	if ep.bound {
-		if ep.stack.CheckLocalAddress(ep.RegisterNICID, ep.NetProto, ep.BindAddr) == 0 {
+	if e.bound {
+		if e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, e.BindAddr) == 0 {
 			panic(tcpip.ErrBadLocalAddress)
 		}
 	}
 
-	if ep.associated {
-		if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
+	if e.associated {
+		if err := e.stack.RegisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e); err != nil {
 			panic(err)
 		}
 	}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index bde071f2a..518449602 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -11,8 +11,7 @@ go_template_instance(
     template = "//pkg/ilist:generic_list",
     types = {
         "Element": "*segment",
-        "ElementMapper": "segmentMapper",
-        "Linker": "*segmentEntry",
+        "Linker": "*segment",
     },
 )
 
@@ -28,19 +27,6 @@ go_template_instance(
     },
 )
 
-go_template_instance(
-    name = "tcp_rack_segment_list",
-    out = "tcp_rack_segment_list.go",
-    package = "tcp",
-    prefix = "rackSegment",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*segment",
-        "ElementMapper": "rackSegmentMapper",
-        "Linker": "*rackSegmentEntry",
-    },
-)
-
 go_library(
     name = "tcp",
     srcs = [
@@ -69,7 +55,6 @@ go_library(
         "snd.go",
         "snd_state.go",
         "tcp_endpoint_list.go",
-        "tcp_rack_segment_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
@@ -84,6 +69,7 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
@@ -108,6 +94,7 @@ go_test(
     shard_count = 10,
    deps = [
         ":tcp",
+        "//pkg/rand",
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 87980c0a1..0aaef495d 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -491,7 +491,7 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 			h.ep.mu.Lock()
 		}
 		if n&notifyError != 0 {
-			return h.ep.takeLastError()
+			return h.ep.LastError()
 		}
 	}
 
@@ -522,7 +522,7 @@ func (h *handshake) execute() *tcpip.Error {
 	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
 	defer s.Done()
 
-	var sackEnabled SACKEnabled
+	var sackEnabled tcpip.TCPSACKEnabled
 	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
 		// If stack returned an error when checking for SACKEnabled
 		// status then just default to switching off SACK negotiation.
@@ -620,7 +620,7 @@ func (h *handshake) execute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.takeLastError()
+				return h.ep.LastError()
 			}
 
 		case wakerForNewSegment:
@@ -747,6 +747,7 @@ func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedV
 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	optLen := len(tf.opts)
 	tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen))
+	pkt.TransportProtocolNumber = header.TCPProtocolNumber
 	tcp.Encode(&header.TCPFields{
 		SrcPort: tf.id.LocalPort,
 		DstPort: tf.id.RemotePort,
@@ -803,7 +804,7 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 		pkt.Owner = owner
 		pkt.EgressRoute = r
 		pkt.GSOOptions = gso
-		pkt.NetworkProtocolNumber = r.NetworkProtocolNumber()
+		pkt.NetworkProtocolNumber = r.NetProto
 		data.ReadToVV(&pkt.Data, packetSize)
 		buildTCPHdr(r, tf, pkt, gso)
 		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
@@ -897,7 +898,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 // sendRaw sends a TCP segment to the endpoint's peer.
 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
 	var sackBlocks []header.SACKBlock
-	if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+	if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
@@ -924,18 +925,7 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 
 	first := e.sndQueue.Front()
 	if first != nil {
-		lastSeg := e.snd.writeList.Back()
 		e.snd.writeList.PushBackList(&e.sndQueue)
-		if lastSeg == nil {
-			lastSeg = e.snd.writeList.Front()
-		} else {
-			lastSeg = lastSeg.segEntry.Next()
-		}
-		// Add new segments to rcList, as rcList and writeList should
-		// be consistent.
-		for seg := lastSeg; seg != nil; seg = seg.segEntry.Next() {
-			e.snd.rcList.PushBack(seg)
-		}
 		e.sndBufInQueue = 0
 	}
 
@@ -1013,9 +1003,8 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 	// (indicated by a negative send window scale).
 	e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
 
-	rcvBufSize := seqnum.Size(e.receiveBufferSize())
 	e.rcvListMu.Lock()
-	e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+	e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
 	// Bootstrap the auto tuning algorithm. Starting at zero will
 	// result in a really large receive window after the first auto
 	// tuning adjustment.
@@ -1146,12 +1135,11 @@ func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
 		}
 
 		cont, err := e.handleSegment(s)
+		s.decRef()
 		if err != nil {
-			s.decRef()
 			return err
 		}
 		if !cont {
-			s.decRef()
 			return nil
 		}
 	}
@@ -1243,7 +1231,6 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 		// or a notification from the protocolMainLoop (caller goroutine).
 		// This means that with this return, the segment dequeue below can
 		// never occur on a closed endpoint.
-		s.decRef()
 		return false, nil
 	}
 
@@ -1435,10 +1422,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				e.rcv.nonZeroWindow()
 			}
 
-			if n&notifyReceiveWindowChanged != 0 {
-				e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
-			}
-
 			if n&notifyMTUChanged != 0 {
 				e.sndBufMu.Lock()
 				count := e.packetTooBigCount
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 804e95aea..560b4904c 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -78,16 +78,15 @@ func testV4Connect(t *testing.T, c *context.Context, checkers ...checker.Network
 	ackCheckers := append(checkers, checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(c.IRS)+1),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS)+1),
+		checker.TCPAckNum(uint32(iss)+1),
 	))
 	checker.IPv4(t, c.GetPacket(), ackCheckers...)
 
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -186,16 +185,15 @@ func testV6Connect(t *testing.T, c *context.Context, checkers ...checker.Network
 	ackCheckers := append(checkers, checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(c.IRS)+1),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS)+1),
+		checker.TCPAckNum(uint32(iss)+1),
 	))
 	checker.IPv6(t, c.GetV6Packet(), ackCheckers...)
 
 	// Wait for connection to be established.
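Stack-wide TCP options likewise move from package-local types (SACKEnabled, SendBufferSizeOption, ...) to tcpip types (tcpip.TCPSACKEnabled, tcpip.TCPSendBufferSizeRangeOption, ...) that are passed by pointer, as the TCPSynRcvdCountThresholdOption call in the test below shows. A minimal sketch of enabling SACK under the new API:

	opt := tcpip.TCPSACKEnabled(true)
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
		// tcpip.ErrUnknownProtocolOption if the protocol rejects the option
	}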
select { case <-ch: - nep, _, err = c.EP.Accept() + nep, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %v", err) } @@ -494,7 +492,7 @@ func TestV6AcceptOnV6(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn), - checker.AckNum(uint32(irs)+1), + checker.TCPAckNum(uint32(irs)+1), ), ) @@ -512,13 +510,13 @@ func TestV6AcceptOnV6(t *testing.T) { we, ch := waiter.NewChannelEntry(nil) c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - - nep, _, err := c.EP.Accept() + var addr tcpip.FullAddress + nep, _, err := c.EP.Accept(&addr) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - nep, _, err = c.EP.Accept() + nep, _, err = c.EP.Accept(&addr) if err != nil { t.Fatalf("Accept failed: %v", err) } @@ -528,20 +526,14 @@ func TestV6AcceptOnV6(t *testing.T) { } } + if addr.Addr != context.TestV6Addr { + t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr) + } + // Make sure we can still query the v6 only status of the new endpoint, // that is, that it is in fact a v6 socket. if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil { - t.Fatalf("GetSockOpt failed failed: %v", err) - } - - // Check the peer address. - addr, err := nep.GetRemoteAddress() - if err != nil { - t.Fatalf("GetRemoteAddress failed failed: %v", err) - } - - if addr.Addr != context.TestV6Addr { - t.Fatalf("Unexpected remote address: got %v, want %v", addr.Addr, context.TestV6Addr) + t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err) } } @@ -568,8 +560,9 @@ func TestV4AcceptOnV4(t *testing.T) { func testV4ListenClose(t *testing.T, c *context.Context) { // Set the SynRcvd threshold to zero to force a syn cookie based accept // to happen. - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption failed: %s", err) + var opt tcpip.TCPSynRcvdCountThresholdOption + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("setting TCPSynRcvdCountThresholdOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } const n = uint16(32) @@ -612,12 +605,12 @@ func testV4ListenClose(t *testing.T, c *context.Context) { we, ch := waiter.NewChannelEntry(nil) c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - nep, _, err := c.EP.Accept() + nep, _, err := c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - nep, _, err = c.EP.Accept() + nep, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %v", err) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 21a4b6e2f..3bcd3923a 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -63,6 +63,17 @@ const ( StateClosing ) +const ( + // rcvAdvWndScale is used to split the available socket buffer into + // application buffer and the window to be advertised to the peer. This is + // currently hard coded to split the available space equally. + rcvAdvWndScale = 1 + + // SegOverheadFactor is used to multiply the value provided by the + // user on a SetSockOpt for setting the socket send/receive buffer sizes. + SegOverheadFactor = 2 +) + // connected returns true when s is one of the states representing an // endpoint connected to a peer. 
func (s EndpointState) connected() bool {
@@ -149,7 +160,6 @@ func (s EndpointState) String() string {
 // Reasons for notifying the protocol goroutine.
 const (
 	notifyNonZeroReceiveWindow = 1 << iota
-	notifyReceiveWindowChanged
 	notifyClose
 	notifyMTUChanged
 	notifyDrain
@@ -238,6 +248,11 @@ type ReceiveErrors struct {
 	// ZeroRcvWindowState is the number of times we advertised
 	// a zero receive window when rcvList is full.
 	ZeroRcvWindowState tcpip.StatCounter
+
+	// WantZeroRcvWindow is the number of times we wanted to advertise a
+	// zero receive window but couldn't because it would have caused
+	// the receive window's right edge to shrink.
+	WantZeroRcvWindow tcpip.StatCounter
 }
 
 // SendErrors collect segment send errors within the transport layer.
@@ -384,13 +399,26 @@ type endpoint struct {
 	// to indicate to users that no more data is coming.
 	//
 	// rcvListMu can be taken after the endpoint mu below.
-	rcvListMu sync.Mutex `state:"nosave"`
-	rcvList segmentList `state:"wait"`
-	rcvClosed bool
-	rcvBufSize int
+	rcvListMu sync.Mutex `state:"nosave"`
+	rcvList segmentList `state:"wait"`
+	rcvClosed bool
+	// rcvBufSize is the total size of the receive buffer.
+	rcvBufSize int
+	// rcvBufUsed is the actual number of payload bytes held in the receive buffer
+	// not counting any overheads of the segments itself. NOTE: This will always
+	// be strictly <= rcvMemUsed below.
 	rcvBufUsed    int
 	rcvAutoParams rcvBufAutoTuneParams
 
+	// rcvMemUsed tracks the total amount of memory in use by received segments
+	// held in rcvList, pendingRcvdSegments and the segment queue. This is used to
+	// compute the window and the actual available buffer space. This is distinct
+	// from rcvBufUsed above which is the actual number of payload bytes held in
+	// the buffer not including any segment overheads.
+	//
+	// rcvMemUsed must be accessed atomically.
+	rcvMemUsed int32
+
 	// mu protects all endpoint fields unless documented otherwise. mu must
 	// be acquired before interacting with the endpoint fields.
 	mu sync.Mutex `state:"nosave"`
@@ -654,6 +682,9 @@ type endpoint struct {
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
+
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
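A worked example of the split these constants describe, assuming the values in this diff (rcvAdvWndScale = 1) and a 64 KiB receive buffer with 16 KiB of segment memory in use:

	// wndFromSpace(space) == space >> rcvAdvWndScale, i.e. space / 2 here.
	space := 65536 - 16384            // buffer size minus rcvMemUsed: 49152
	window := space >> rcvAdvWndScale // advertised window: 24576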
@@ -849,12 +880,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue maxSynRetries: DefaultSynRetries, } - var ss SendBufferSizeOption + var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { e.sndBufSize = ss.Default } - var rs ReceiveBufferSizeOption + var rs tcpip.TCPReceiveBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { e.rcvBufSize = rs.Default } @@ -864,12 +895,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue e.cc = cs } - var mrb tcpip.ModerateReceiveBufferOption + var mrb tcpip.TCPModerateReceiveBufferOption if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { e.rcvAutoParams.disabled = !bool(mrb) } - var de DelayEnabled + var de tcpip.TCPDelayEnabled if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { e.SetSockOptBool(tcpip.DelayOption, true) } @@ -888,7 +919,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue e.probe = p } - e.segmentQueue.setLimit(MaxUnprocessedSegments) + e.segmentQueue.ep = e e.tsOffset = timeStampOffset() e.acceptCond = sync.NewCond(&e.acceptMu) @@ -901,7 +932,12 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { result := waiter.EventMask(0) switch e.EndpointState() { - case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv: + case StateInitial, StateBound: + // This prevents blocking of new sockets which are not + // connected when SO_LINGER is set. + result |= waiter.EventHUp + + case StateConnecting, StateSynSent, StateSynRecv: // Ready for nothing. case StateClose, StateError, StateTimeWait: @@ -1007,6 +1043,26 @@ func (e *endpoint) Close() { return } + if e.linger.Enabled && e.linger.Timeout == 0 { + s := e.EndpointState() + isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv + if isResetState { + // Close the endpoint without doing full shutdown and + // send a RST. + e.resetConnectionLocked(tcpip.ErrConnectionAborted) + e.closeNoShutdownLocked() + + // Wake up worker to close the endpoint. + switch s { + case StateSynRecv: + e.notifyProtocolGoroutine(notifyClose) + default: + e.notifyProtocolGoroutine(notifyTickleWorker) + } + return + } + } + // Issue a shutdown so that the peer knows we won't send any more data // if we're connected, or stop accepting if we're listening. e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) @@ -1052,6 +1108,8 @@ func (e *endpoint) closeNoShutdownLocked() { e.notifyProtocolGoroutine(notifyClose) } else { e.transitionToStateCloseLocked() + // Notify that the endpoint is closed. + e.waiterQueue.Notify(waiter.EventHUp) } } @@ -1106,10 +1164,16 @@ func (e *endpoint) cleanupLocked() { tcpip.DeleteDanglingEndpoint(e) } +// wndFromSpace returns the window that we can advertise based on the available +// receive buffer space. +func wndFromSpace(space int) int { + return space >> rcvAdvWndScale +} + // initialReceiveWindow returns the initial receive window to advertise in the // SYN/SYN-ACK. func (e *endpoint) initialReceiveWindow() int { - rcvWnd := e.receiveBufferAvailable() + rcvWnd := wndFromSpace(e.receiveBufferAvailable()) if rcvWnd > math.MaxUint16 { rcvWnd = math.MaxUint16 } @@ -1186,14 +1250,12 @@ func (e *endpoint) ModerateRecvBuf(copied int) { // reject valid data that might already be in flight as the // acceptable window will shrink. 
if rcvWnd > e.rcvBufSize { - availBefore := e.receiveBufferAvailableLocked() + availBefore := wndFromSpace(e.receiveBufferAvailableLocked()) e.rcvBufSize = rcvWnd - availAfter := e.receiveBufferAvailableLocked() - mask := uint32(notifyReceiveWindowChanged) + availAfter := wndFromSpace(e.receiveBufferAvailableLocked()) if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above { - mask |= notifyNonZeroReceiveWindow + e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) } - e.notifyProtocolGoroutine(mask) } // We only update prevCopied when we grow the buffer because in cases @@ -1211,7 +1273,7 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -func (e *endpoint) takeLastError() *tcpip.Error { +func (e *endpoint) LastError() *tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError @@ -1270,18 +1332,22 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { v := views[s.viewToDeliver] s.viewToDeliver++ + var delta int if s.viewToDeliver >= len(views) { e.rcvList.Remove(s) + // We only free up receive buffer space when the segment is released as the + // segment is still holding on to the views even though some views have been + // read out to the user. + delta = s.segMemSize() s.decRef() } e.rcvBufUsed -= len(v) - // If the window was small before this read and if the read freed up // enough buffer space, to either fit an aMSS or half a receive buffer // (whichever smaller), then notify the protocol goroutine to send a // window update. - if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above { + if crossed, above := e.windowCrossedACKThresholdLocked(delta); crossed && above { e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) } @@ -1294,14 +1360,17 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndBufMu func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { - // The endpoint cannot be written to if it's not connected. - if !e.EndpointState().connected() { - switch e.EndpointState() { - case StateError: - return 0, e.HardError - default: - return 0, tcpip.ErrClosedForSend - } + switch s := e.EndpointState(); { + case s == StateError: + return 0, e.HardError + case !s.connecting() && !s.connected(): + return 0, tcpip.ErrClosedForSend + case s.connecting(): + // As per RFC793, page 56, a send request arriving when in connecting + // state, can be queued to be completed after the state becomes + // connected. Return an error code for the caller of endpoint Write to + // try again, until the connection handshake is complete. + return 0, tcpip.ErrWouldBlock } // Check if the connection has already been closed for sends. @@ -1428,7 +1497,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro vec = append([][]byte(nil), vec...) var num int64 - for s := e.rcvList.Front(); s != nil; s = s.segEntry.Next() { + for s := e.rcvList.Front(); s != nil; s = s.Next() { views := s.data.Views() for i := s.viewToDeliver; i < len(views); i++ { @@ -1454,12 +1523,44 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro return num, tcpip.ControlMessages{}, nil } +// selectWindowLocked returns the new window without checking for shrinking or scaling +// applied. +// Precondition: e.mu and e.rcvListMu must be held. 
+func (e *endpoint) selectWindowLocked() (wnd seqnum.Size) {
+	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked())
+	maxWindow := wndFromSpace(e.rcvBufSize)
+	wndFromUsedBytes := maxWindow - e.rcvBufUsed
+
+	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
+	// cases where we receive a lot of small segments the segment overhead is a
+	// lot higher and we can run out of socket buffer space before we can fill the
+	// previous window we advertised. In cases where we receive MSS sized or close
+	// to MSS sized segments we will probably run out of window space before we
+	// exhaust the receive buffer.
+	newWnd := wndFromAvailable
+	if newWnd > wndFromUsedBytes {
+		newWnd = wndFromUsedBytes
+	}
+	if newWnd < 0 {
+		newWnd = 0
+	}
+	return seqnum.Size(newWnd)
+}
+
+// selectWindow invokes selectWindowLocked after acquiring e.rcvListMu.
+func (e *endpoint) selectWindow() (wnd seqnum.Size) {
+	e.rcvListMu.Lock()
+	wnd = e.selectWindowLocked()
+	e.rcvListMu.Unlock()
+	return wnd
+}
+
 // windowCrossedACKThresholdLocked checks if the receive window to be announced
-// now would be under aMSS or under half receive buffer, whichever smaller. This
-// is useful as a receive side silly window syndrome prevention mechanism. If
-// window grows to reasonable value, we should send ACK to the sender to inform
-// the rx space is now large. We also want ensure a series of small read()'s
-// won't trigger a flood of spurious tiny ACK's.
+// would be under aMSS or under the window derived from half receive buffer,
+// whichever smaller. This is useful as a receive side silly window syndrome
+// prevention mechanism. If window grows to reasonable value, we should send ACK
+// to the sender to inform the rx space is now large. We also want to ensure a
+// series of small read()'s won't trigger a flood of spurious tiny ACK's.
 //
 // For large receive buffers, the threshold is aMSS - once reader reads more
 // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
@@ -1470,17 +1571,18 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 //
 // Precondition: e.mu and e.rcvListMu must be held.
 func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
-	newAvail := e.receiveBufferAvailableLocked()
+	newAvail := int(e.selectWindowLocked())
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
 		oldAvail = 0
 	}
 	threshold := int(e.amss)
-	if threshold > e.rcvBufSize/2 {
-		threshold = e.rcvBufSize / 2
+	// rcvBufFraction is the inverse of the fraction of receive buffer size that
+	// is used to decide if the available buffer space is now above it.
+	const rcvBufFraction = 2
+	if wndThreshold := wndFromSpace(e.rcvBufSize / rcvBufFraction); threshold > wndThreshold {
+		threshold = wndThreshold
 	}
-
 	switch {
 	case oldAvail < threshold && newAvail >= threshold:
 		return true, true
@@ -1609,18 +1711,24 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
-		var rs ReceiveBufferSizeOption
-		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
+			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &rs, err))
+		}
+
+		if v > rs.Max {
+			v = rs.Max
+		}
+
+		if v < math.MaxInt32/SegOverheadFactor {
+			v *= SegOverheadFactor
 			if v < rs.Min {
 				v = rs.Min
 			}
-			if v > rs.Max {
-				v = rs.Max
-			}
+		} else {
+			v = math.MaxInt32
 		}
 
-		mask := uint32(notifyReceiveWindowChanged)
-
 		e.LockUser()
 		e.rcvListMu.Lock()
 
@@ -1634,14 +1742,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 			v = 1 << scale
 		}
 
-		// Make sure 2*size doesn't overflow.
-		if v > math.MaxInt32/2 {
-			v = math.MaxInt32 / 2
-		}
-
-		availBefore := e.receiveBufferAvailableLocked()
+		availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
 		e.rcvBufSize = v
-		availAfter := e.receiveBufferAvailableLocked()
+		availAfter := wndFromSpace(e.receiveBufferAvailableLocked())
 
 		e.rcvAutoParams.disabled = true
 
@@ -1649,24 +1752,31 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// syndrome prevention, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
 		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
-			mask |= notifyNonZeroReceiveWindow
+			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 		}
 
 		e.rcvListMu.Unlock()
 		e.UnlockUser()
-		e.notifyProtocolGoroutine(mask)
 
 	case tcpip.SendBufferSizeOption:
 		// Make sure the send buffer size is within the min and max
 		// allowed.
-		var ss SendBufferSizeOption
-		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
+		var ss tcpip.TCPSendBufferSizeRangeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err != nil {
+			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &ss, err))
+		}
+
+		if v > ss.Max {
+			v = ss.Max
+		}
+
+		if v < math.MaxInt32/SegOverheadFactor {
+			v *= SegOverheadFactor
 			if v < ss.Min {
 				v = ss.Min
 			}
-			if v > ss.Max {
-				v = ss.Max
-			}
+		} else {
+			v = math.MaxInt32
 		}
 
 		e.sndBufMu.Lock()
@@ -1699,7 +1809,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 			return tcpip.ErrInvalidOptionValue
 		}
 	}
-	var rs ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 		if v < rs.Min/2 {
 			v = rs.Min / 2
@@ -1713,10 +1823,10 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	}
 
 // SetSockOpt sets a socket option.
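Restated as a standalone function for clarity (a sketch, not the diff's code; min and max correspond to the stack's tcpip.TCPReceiveBufferSizeRangeOption bounds), the clamp applied above doubles the requested size so half of the buffer can absorb per-segment overhead:

	const segOverheadFactor = 2 // mirrors SegOverheadFactor above

	func clampBufferSize(v, min, max int) int {
		if v > max {
			v = max
		}
		if v < math.MaxInt32/segOverheadFactor {
			v *= segOverheadFactor // headroom for segment bookkeeping
			if v < min {
				v = min
			}
		} else {
			v = math.MaxInt32 // avoid overflow when doubling
		}
		return v
	}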
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { +func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { switch v := opt.(type) { - case tcpip.BindToDeviceOption: - id := tcpip.NICID(v) + case *tcpip.BindToDeviceOption: + id := tcpip.NICID(*v) if id != 0 && !e.stack.HasNIC(id) { return tcpip.ErrUnknownDevice } @@ -1724,40 +1834,40 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.bindToDevice = id e.UnlockUser() - case tcpip.KeepaliveIdleOption: + case *tcpip.KeepaliveIdleOption: e.keepalive.Lock() - e.keepalive.idle = time.Duration(v) + e.keepalive.idle = time.Duration(*v) e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) - case tcpip.KeepaliveIntervalOption: + case *tcpip.KeepaliveIntervalOption: e.keepalive.Lock() - e.keepalive.interval = time.Duration(v) + e.keepalive.interval = time.Duration(*v) e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) - case tcpip.OutOfBandInlineOption: + case *tcpip.OutOfBandInlineOption: // We don't currently support disabling this option. - case tcpip.TCPUserTimeoutOption: + case *tcpip.TCPUserTimeoutOption: e.LockUser() - e.userTimeout = time.Duration(v) + e.userTimeout = time.Duration(*v) e.UnlockUser() - case tcpip.CongestionControlOption: + case *tcpip.CongestionControlOption: // Query the available cc algorithms in the stack and // validate that the specified algorithm is actually // supported in the stack. - var avail tcpip.AvailableCongestionControlOption + var avail tcpip.TCPAvailableCongestionControlOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil { return err } availCC := strings.Split(string(avail), " ") for _, cc := range availCC { - if v == tcpip.CongestionControlOption(cc) { + if *v == tcpip.CongestionControlOption(cc) { e.LockUser() state := e.EndpointState() - e.cc = v + e.cc = *v switch state { case StateEstablished: if e.EndpointState() == state { @@ -1773,31 +1883,45 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { // control algorithm is specified. return tcpip.ErrNoSuchFile - case tcpip.TCPLingerTimeoutOption: + case *tcpip.TCPLingerTimeoutOption: e.LockUser() - if v < 0 { + + switch { + case *v < 0: // Same as effectively disabling TCPLinger timeout. - v = 0 - } - // Cap it to MaxTCPLingerTimeout. - stkTCPLingerTimeout := tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) - if v > stkTCPLingerTimeout { - v = stkTCPLingerTimeout + *v = -1 + case *v == 0: + // Same as the stack default. + var stackLingerTimeout tcpip.TCPLingerTimeoutOption + if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil { + panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err)) + } + *v = stackLingerTimeout + case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout): + // Cap it to Stack's default TCP_LINGER2 timeout. 
+ *v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) + default: } - e.tcpLingerTimeout = time.Duration(v) + + e.tcpLingerTimeout = time.Duration(*v) e.UnlockUser() - case tcpip.TCPDeferAcceptOption: + case *tcpip.TCPDeferAcceptOption: e.LockUser() - if time.Duration(v) > MaxRTO { - v = tcpip.TCPDeferAcceptOption(MaxRTO) + if time.Duration(*v) > MaxRTO { + *v = tcpip.TCPDeferAcceptOption(MaxRTO) } - e.deferAccept = time.Duration(v) + e.deferAccept = time.Duration(*v) e.UnlockUser() - case tcpip.SocketDetachFilterOption: + case *tcpip.SocketDetachFilterOption: return nil + case *tcpip.LingerOption: + e.LockUser() + e.linger = *v + e.UnlockUser() + default: return nil } @@ -1956,11 +2080,8 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { switch o := opt.(type) { - case tcpip.ErrorOption: - return e.takeLastError() - case *tcpip.BindToDeviceOption: e.LockUser() *o = tcpip.BindToDeviceOption(e.bindToDevice) @@ -2013,8 +2134,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.UnlockUser() case *tcpip.OriginalDestinationOption: + e.LockUser() ipt := e.stack.IPTables() - addr, port, err := ipt.OriginalDst(e.ID) + addr, port, err := ipt.OriginalDst(e.ID, e.NetProto) + e.UnlockUser() if err != nil { return err } @@ -2023,6 +2146,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { Port: port, } + case *tcpip.LingerOption: + e.LockUser() + *o = e.linger + e.UnlockUser() + default: return tcpip.ErrUnknownProtocolOption } @@ -2169,7 +2297,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc if sameAddr && p == e.ID.RemotePort { return false, nil } - if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil { + if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr, nil /* testPort */); err != nil { if err != tcpip.ErrPortInUse || !reuse { return false, nil } @@ -2207,7 +2335,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc tcpEP.notifyProtocolGoroutine(notifyAbort) tcpEP.UnlockUser() // Now try and Reserve again if it fails then we skip. - if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil { + if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr, nil /* testPort */); err != nil { return false, nil } } @@ -2249,7 +2377,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc if !handshake { e.segmentQueue.mu.Lock() for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { - for s := l.Front(); s != nil; s = s.segEntry.Next() { + for s := l.Front(); s != nil; s = s.Next() { s.id = e.ID s.route = r.Clone() e.sndWaker.Assert() @@ -2447,7 +2575,9 @@ func (e *endpoint) startAcceptedLoop() { // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. -func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { +// +// addr if not-nil will contain the peer address of the returned endpoint. 
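A minimal sketch of the new Accept call pattern (illustrative; lep is a listening TCP endpoint), which saves the separate GetRemoteAddress round trip the dual-stack tests previously made:

	var peer tcpip.FullAddress
	nep, wq, err := lep.Accept(&peer) // pass nil if the peer address is not needed
	// On success, peer.Addr and peer.Port identify the remote endpoint.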
+func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { e.LockUser() defer e.UnlockUser() @@ -2469,6 +2599,9 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { default: return nil, nil, tcpip.ErrWouldBlock } + if peerAddr != nil { + *peerAddr = n.getRemoteAddress() + } return n, n.waiterQueue, nil } @@ -2505,47 +2638,45 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { } } - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{}) - if err != nil { - return err - } - - e.boundBindToDevice = e.bindToDevice - e.boundPortFlags = e.portFlags - e.isPortReserved = true - e.effectiveNetProtos = netProtos - e.ID.LocalPort = port - - // Any failures beyond this point must remove the port registration. - defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) { - if err != nil { - e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice, tcpip.FullAddress{}) - e.isPortReserved = false - e.effectiveNetProtos = nil - e.ID.LocalPort = 0 - e.ID.LocalAddress = "" - e.boundNICID = 0 - e.boundBindToDevice = 0 - e.boundPortFlags = ports.Flags{} - } - }(e.boundPortFlags, e.boundBindToDevice) - + var nic tcpip.NICID // If an address is specified, we must ensure that it's one of our // local addresses. if len(addr.Addr) != 0 { - nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) + nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) if nic == 0 { return tcpip.ErrBadLocalAddress } - - e.boundNICID = nic e.ID.LocalAddress = addr.Addr } - if err := e.stack.CheckRegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e.boundPortFlags, e.boundBindToDevice); err != nil { + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{}, func(p uint16) bool { + id := e.ID + id.LocalPort = p + // CheckRegisterTransportEndpoint should only return an error if there is a + // listening endpoint bound with the same id and portFlags and bindToDevice + // options. + // + // NOTE: Only listening and connected endpoint register with + // demuxer. Further connected endpoints always have a remote + // address/port. Hence this will only return an error if there is a matching + // listening endpoint. + if err := e.stack.CheckRegisterTransportEndpoint(nic, netProtos, ProtocolNumber, id, e.portFlags, e.bindToDevice); err != nil { + return false + } + return true + }) + if err != nil { return err } + e.boundBindToDevice = e.bindToDevice + e.boundPortFlags = e.portFlags + // TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct. + e.boundNICID = nic + e.isPortReserved = true + e.effectiveNetProtos = netProtos + e.ID.LocalPort = port + // Mark endpoint as bound. 
e.setEndpointState(StateBound) @@ -2573,11 +2704,15 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { return tcpip.FullAddress{}, tcpip.ErrNotConnected } + return e.getRemoteAddress(), nil +} + +func (e *endpoint) getRemoteAddress() tcpip.FullAddress { return tcpip.FullAddress{ Addr: e.ID.RemoteAddress, Port: e.ID.RemotePort, NIC: e.boundNICID, - }, nil + } } func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { @@ -2648,13 +2783,8 @@ func (e *endpoint) updateSndBufferUsage(v int) { func (e *endpoint) readyToRead(s *segment) { e.rcvListMu.Lock() if s != nil { + e.rcvBufUsed += s.payloadSize() s.incRef() - e.rcvBufUsed += s.data.Size() - // Increase counter if the receive window falls down below MSS - // or half receive buffer size, whichever smaller. - if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above { - e.stats.ReceiveErrors.ZeroRcvWindowState.Increment() - } e.rcvList.PushBack(s) } else { e.rcvClosed = true @@ -2669,15 +2799,17 @@ func (e *endpoint) readyToRead(s *segment) { func (e *endpoint) receiveBufferAvailableLocked() int { // We may use more bytes than the buffer size when the receive buffer // shrinks. - if e.rcvBufUsed >= e.rcvBufSize { + memUsed := e.receiveMemUsed() + if memUsed >= e.rcvBufSize { return 0 } - return e.rcvBufSize - e.rcvBufUsed + return e.rcvBufSize - memUsed } // receiveBufferAvailable calculates how many bytes are still available in the -// receive buffer. +// receive buffer based on the actual memory used by all segments held in +// receive buffer/pending and segment queue. func (e *endpoint) receiveBufferAvailable() int { e.rcvListMu.Lock() available := e.receiveBufferAvailableLocked() @@ -2685,16 +2817,37 @@ func (e *endpoint) receiveBufferAvailable() int { return available } +// receiveBufferUsed returns the amount of in-use receive buffer. +func (e *endpoint) receiveBufferUsed() int { + e.rcvListMu.Lock() + used := e.rcvBufUsed + e.rcvListMu.Unlock() + return used +} + +// receiveBufferSize returns the current size of the receive buffer. func (e *endpoint) receiveBufferSize() int { e.rcvListMu.Lock() size := e.rcvBufSize e.rcvListMu.Unlock() - return size } +// receiveMemUsed returns the total memory in use by segments held by this +// endpoint. +func (e *endpoint) receiveMemUsed() int { + return int(atomic.LoadInt32(&e.rcvMemUsed)) +} + +// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. +func (e *endpoint) updateReceiveMemUsed(delta int) { + atomic.AddInt32(&e.rcvMemUsed, int32(delta)) +} + +// maxReceiveBufferSize returns the stack wide maximum receive buffer size for +// an endpoint. func (e *endpoint) maxReceiveBufferSize() int { - var rs ReceiveBufferSizeOption + var rs tcpip.TCPReceiveBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { // As a fallback return the hardcoded max buffer size. return MaxBufferSize @@ -2774,7 +2927,7 @@ func timeStampOffset() uint32 { // if the SYN options indicate that the SACK option was negotiated and the TCP // stack is configured to enable TCP SACK option. func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { - var v SACKEnabled + var v tcpip.TCPSACKEnabled if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { // Stack doesn't support SACK. So just return. 
return @@ -2843,7 +2996,6 @@ func (e *endpoint) completeState() stack.TCPEndpointState { RcvAcc: e.rcv.rcvAcc, RcvWndScale: e.rcv.rcvWndScale, PendingBufUsed: e.rcv.pendingBufUsed, - PendingBufSize: e.rcv.pendingBufSize, } // Copy sender state. @@ -2898,6 +3050,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { EndSequence: rc.endSequence, FACK: rc.fack, RTT: rc.rtt, + Reord: rc.reorderSeen, } return s } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 723e47ddc..b25431467 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -44,7 +44,7 @@ func (e *endpoint) drainSegmentLocked() { // beforeSave is invoked by stateify. func (e *endpoint) beforeSave() { // Stop incoming packets. - e.segmentQueue.setLimit(0) + e.segmentQueue.freeze() e.mu.Lock() defer e.mu.Unlock() @@ -178,18 +178,18 @@ func (e *endpoint) afterLoad() { // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume(s *stack.Stack) { e.stack = s - e.segmentQueue.setLimit(MaxUnprocessedSegments) + e.segmentQueue.thaw() epState := e.origEndpointState switch epState { case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished: - var ss SendBufferSizeOption + var ss tcpip.TCPSendBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max { panic(fmt.Sprintf("endpoint.sndBufSize %d is outside the min and max allowed [%d, %d]", e.sndBufSize, ss.Min, ss.Max)) } } - var rs ReceiveBufferSizeOption + var rs tcpip.TCPReceiveBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { if e.rcvBufSize < rs.Min || e.rcvBufSize > rs.Max { panic(fmt.Sprintf("endpoint.rcvBufSize %d is outside the min and max allowed [%d, %d]", e.rcvBufSize, rs.Min, rs.Max)) diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index c5afa2680..5bce73605 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -12,12 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package tcp contains the implementation of the TCP transport protocol. To use -// it in the networking stack, this package must be added to the project, and -// activated on the stack by passing tcp.NewProtocol() as one of the -// transport protocols when calling stack.New(). Then endpoints can be created -// by passing tcp.ProtocolNumber as the transport protocol number when calling -// Stack.NewEndpoint(). +// Package tcp contains the implementation of the TCP transport protocol. package tcp import ( @@ -29,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" @@ -79,50 +75,6 @@ const ( ccCubic = "cubic" ) -// SACKEnabled is used by stack.(*Stack).TransportProtocolOption to -// enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018. -type SACKEnabled bool - -// Recovery is used by stack.(*Stack).TransportProtocolOption to -// set loss detection algorithm in TCP. -type Recovery int32 - -const ( - // RACKLossDetection indicates RACK is used for loss detection and - // recovery. 
- RACKLossDetection Recovery = 1 << iota - - // RACKStaticReoWnd indicates the reordering window should not be - // adjusted when DSACK is received. - RACKStaticReoWnd - - // RACKNoDupTh indicates RACK should not consider the classic three - // duplicate acknowledgements rule to mark the segments as lost. This - // is used when reordering is not detected. - RACKNoDupTh -) - -// DelayEnabled is used by stack.(Stack*).TransportProtocolOption to -// enable/disable Nagle's algorithm in TCP. -type DelayEnabled bool - -// SendBufferSizeOption is used by stack.(Stack*).TransportProtocolOption -// to get/set the default, min and max TCP send buffer sizes. -type SendBufferSizeOption struct { - Min int - Default int - Max int -} - -// ReceiveBufferSizeOption is used by -// stack.(Stack*).TransportProtocolOption to get/set the default, min and max -// TCP receive buffer sizes. -type ReceiveBufferSizeOption struct { - Min int - Default int - Max int -} - // syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The // value is protected by a mutex so that we can increment only when it's // guaranteed not to go above a threshold. @@ -181,12 +133,14 @@ func (s *synRcvdCounter) Threshold() uint64 { } type protocol struct { + stack *stack.Stack + mu sync.RWMutex sackEnabled bool - recovery Recovery + recovery tcpip.TCPRecovery delayEnabled bool - sendBufferSize SendBufferSizeOption - recvBufferSize ReceiveBufferSizeOption + sendBufferSize tcpip.TCPSendBufferSizeRangeOption + recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption congestionControl string availableCongestionControl []string moderateReceiveBuffer bool @@ -207,14 +161,14 @@ func (*protocol) Number() tcpip.TransportProtocolNumber { } // NewEndpoint creates a new tcp endpoint. -func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return newEndpoint(stack, netProto, waiterQueue), nil +func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(p.stack, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. -func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return raw.NewEndpoint(stack, netProto, header.TCPProtocolNumber, waiterQueue) +func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid tcp packet size. @@ -244,21 +198,20 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st // a reset is sent in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected by this // means." 
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { + +func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { s := newSegment(r, id, pkt) defer s.decRef() if !s.parse() || !s.csumValid { - return false + return stack.UnknownDestinationPacketMalformed } - // There's nothing to do if this is already a reset packet. - if s.flagIsSet(header.TCPFlagRst) { - return true + if !s.flagIsSet(header.TCPFlagRst) { + replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL()) } - replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL()) - return true + return stack.UnknownDestinationPacketHandled } // replyWithReset replies to the given segment with a reset segment. @@ -296,49 +249,49 @@ func replyWithReset(s *segment, tos, ttl uint8) { } // SetOption implements stack.TransportProtocol.SetOption. -func (p *protocol) SetOption(option interface{}) *tcpip.Error { +func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error { switch v := option.(type) { - case SACKEnabled: + case *tcpip.TCPSACKEnabled: p.mu.Lock() - p.sackEnabled = bool(v) + p.sackEnabled = bool(*v) p.mu.Unlock() return nil - case Recovery: + case *tcpip.TCPRecovery: p.mu.Lock() - p.recovery = Recovery(v) + p.recovery = *v p.mu.Unlock() return nil - case DelayEnabled: + case *tcpip.TCPDelayEnabled: p.mu.Lock() - p.delayEnabled = bool(v) + p.delayEnabled = bool(*v) p.mu.Unlock() return nil - case SendBufferSizeOption: + case *tcpip.TCPSendBufferSizeRangeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return tcpip.ErrInvalidOptionValue } p.mu.Lock() - p.sendBufferSize = v + p.sendBufferSize = *v p.mu.Unlock() return nil - case ReceiveBufferSizeOption: + case *tcpip.TCPReceiveBufferSizeRangeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return tcpip.ErrInvalidOptionValue } p.mu.Lock() - p.recvBufferSize = v + p.recvBufferSize = *v p.mu.Unlock() return nil - case tcpip.CongestionControlOption: + case *tcpip.CongestionControlOption: for _, c := range p.availableCongestionControl { - if string(v) == c { + if string(*v) == c { p.mu.Lock() - p.congestionControl = string(v) + p.congestionControl = string(*v) p.mu.Unlock() return nil } @@ -347,75 +300,79 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { // is specified. 
return tcpip.ErrNoSuchFile - case tcpip.ModerateReceiveBufferOption: + case *tcpip.TCPModerateReceiveBufferOption: p.mu.Lock() - p.moderateReceiveBuffer = bool(v) + p.moderateReceiveBuffer = bool(*v) p.mu.Unlock() return nil - case tcpip.TCPLingerTimeoutOption: - if v < 0 { - v = 0 - } + case *tcpip.TCPLingerTimeoutOption: p.mu.Lock() - p.lingerTimeout = time.Duration(v) + if *v < 0 { + p.lingerTimeout = 0 + } else { + p.lingerTimeout = time.Duration(*v) + } p.mu.Unlock() return nil - case tcpip.TCPTimeWaitTimeoutOption: - if v < 0 { - v = 0 - } + case *tcpip.TCPTimeWaitTimeoutOption: p.mu.Lock() - p.timeWaitTimeout = time.Duration(v) + if *v < 0 { + p.timeWaitTimeout = 0 + } else { + p.timeWaitTimeout = time.Duration(*v) + } p.mu.Unlock() return nil - case tcpip.TCPTimeWaitReuseOption: - if v < tcpip.TCPTimeWaitReuseDisabled || v > tcpip.TCPTimeWaitReuseLoopbackOnly { + case *tcpip.TCPTimeWaitReuseOption: + if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly { return tcpip.ErrInvalidOptionValue } p.mu.Lock() - p.timeWaitReuse = v + p.timeWaitReuse = *v p.mu.Unlock() return nil - case tcpip.TCPMinRTOOption: - if v < 0 { - v = tcpip.TCPMinRTOOption(MinRTO) - } + case *tcpip.TCPMinRTOOption: p.mu.Lock() - p.minRTO = time.Duration(v) + if *v < 0 { + p.minRTO = MinRTO + } else { + p.minRTO = time.Duration(*v) + } p.mu.Unlock() return nil - case tcpip.TCPMaxRTOOption: - if v < 0 { - v = tcpip.TCPMaxRTOOption(MaxRTO) - } + case *tcpip.TCPMaxRTOOption: p.mu.Lock() - p.maxRTO = time.Duration(v) + if *v < 0 { + p.maxRTO = MaxRTO + } else { + p.maxRTO = time.Duration(*v) + } p.mu.Unlock() return nil - case tcpip.TCPMaxRetriesOption: + case *tcpip.TCPMaxRetriesOption: p.mu.Lock() - p.maxRetries = uint32(v) + p.maxRetries = uint32(*v) p.mu.Unlock() return nil - case tcpip.TCPSynRcvdCountThresholdOption: + case *tcpip.TCPSynRcvdCountThresholdOption: p.mu.Lock() - p.synRcvdCount.SetThreshold(uint64(v)) + p.synRcvdCount.SetThreshold(uint64(*v)) p.mu.Unlock() return nil - case tcpip.TCPSynRetriesOption: - if v < 1 || v > 255 { + case *tcpip.TCPSynRetriesOption: + if *v < 1 || *v > 255 { return tcpip.ErrInvalidOptionValue } p.mu.Lock() - p.synRetries = uint8(v) + p.synRetries = uint8(*v) p.mu.Unlock() return nil @@ -425,33 +382,33 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { } // Option implements stack.TransportProtocol.Option. 
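With options now passed as pointers implementing tcpip.SettableTransportProtocolOption and tcpip.GettableTransportProtocolOption, both directions of the option plumbing use the same concrete type. A sketch of the calling convention, assuming an initialized *stack.Stack with TCP registered (this mirrors the updated tests later in this patch):

package tcpopts

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

// enableSACK flips the stack-wide TCP SACK option and reads it back,
// showing the pointer-based option interfaces introduced above.
func enableSACK(s *stack.Stack) (bool, *tcpip.Error) {
	opt := tcpip.TCPSACKEnabled(true)
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
		return false, err
	}
	var got tcpip.TCPSACKEnabled
	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &got); err != nil {
		return false, err
	}
	return bool(got), nil
}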
-func (p *protocol) Option(option interface{}) *tcpip.Error { +func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error { switch v := option.(type) { - case *SACKEnabled: + case *tcpip.TCPSACKEnabled: p.mu.RLock() - *v = SACKEnabled(p.sackEnabled) + *v = tcpip.TCPSACKEnabled(p.sackEnabled) p.mu.RUnlock() return nil - case *Recovery: + case *tcpip.TCPRecovery: p.mu.RLock() - *v = Recovery(p.recovery) + *v = tcpip.TCPRecovery(p.recovery) p.mu.RUnlock() return nil - case *DelayEnabled: + case *tcpip.TCPDelayEnabled: p.mu.RLock() - *v = DelayEnabled(p.delayEnabled) + *v = tcpip.TCPDelayEnabled(p.delayEnabled) p.mu.RUnlock() return nil - case *SendBufferSizeOption: + case *tcpip.TCPSendBufferSizeRangeOption: p.mu.RLock() *v = p.sendBufferSize p.mu.RUnlock() return nil - case *ReceiveBufferSizeOption: + case *tcpip.TCPReceiveBufferSizeRangeOption: p.mu.RLock() *v = p.recvBufferSize p.mu.RUnlock() @@ -463,15 +420,15 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { p.mu.RUnlock() return nil - case *tcpip.AvailableCongestionControlOption: + case *tcpip.TCPAvailableCongestionControlOption: p.mu.RLock() - *v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) + *v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) p.mu.RUnlock() return nil - case *tcpip.ModerateReceiveBufferOption: + case *tcpip.TCPModerateReceiveBufferOption: p.mu.RLock() - *v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer) + *v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer) p.mu.RUnlock() return nil @@ -546,33 +503,19 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter { // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - // TCP header is variable length, peek at it first. - hdrLen := header.TCPMinimumSize - hdr, ok := pkt.Data.PullUp(hdrLen) - if !ok { - return false - } - - // If the header has options, pull those up as well. - if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { - // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of - // packets. - hdrLen = offset - } - - _, ok = pkt.TransportHeader().Consume(hdrLen) - return ok + return parse.TCP(pkt) } // NewProtocol returns a TCP transport protocol. -func NewProtocol() stack.TransportProtocol { +func NewProtocol(s *stack.Stack) stack.TransportProtocol { p := protocol{ - sendBufferSize: SendBufferSizeOption{ + stack: s, + sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{ Min: MinBufferSize, Default: DefaultSendBufferSize, Max: MaxBufferSize, }, - recvBufferSize: ReceiveBufferSizeOption{ + recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{ Min: MinBufferSize, Default: DefaultReceiveBufferSize, Max: MaxBufferSize, @@ -587,7 +530,7 @@ func NewProtocol() stack.TransportProtocol { minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, - recovery: RACKLossDetection, + recovery: tcpip.TCPRACKLossDetection, } p.dispatcher.init(runtime.GOMAXPROCS(0)) return &p diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go index d969ca23a..d312b1b8b 100644 --- a/pkg/tcpip/transport/tcp/rack.go +++ b/pkg/tcpip/transport/tcp/rack.go @@ -29,26 +29,36 @@ import ( // // +stateify savable type rackControl struct { - // xmitTime is the latest transmission timestamp of rackControl.seg. 
-	xmitTime time.Time `state:".(unixTime)"`
-
 	// endSequence is the ending TCP sequence number of rackControl.seg.
 	endSequence seqnum.Value

+	// dsack indicates if the connection has seen a DSACK.
+	dsack bool
+
 	// fack is the highest selectively or cumulatively acknowledged
 	// sequence.
 	fack seqnum.Value

+	// minRTT is the estimated minimum RTT of the connection.
+	minRTT time.Duration
+
 	// rtt is the RTT of the most recently delivered packet on the
 	// connection (either cumulatively acknowledged or selectively
 	// acknowledged) that was not marked invalid as a possible spurious
 	// retransmission.
 	rtt time.Duration
+
+	// reorderSeen indicates if reordering has been detected on this
+	// connection.
+	reorderSeen bool
+
+	// xmitTime is the latest transmission timestamp of rackControl.seg.
+	xmitTime time.Time `state:".(unixTime)"`
 }

-// Update will update the RACK related fields when an ACK has been received.
+// update updates the RACK-related fields when an ACK has been received.
 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
-func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration, offset uint32) {
+func (rc *rackControl) update(seg *segment, ackSeg *segment, offset uint32) {
 	rtt := time.Now().Sub(seg.xmitTime)

 	// If the ACK is for a retransmitted packet, do not update if it is a
@@ -65,12 +75,21 @@ func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration,
 				return
 			}
 		}
-		if rtt < srtt {
+		if rtt < rc.minRTT {
 			return
 		}
 	}

 	rc.rtt = rtt
+
+	// The sender can either track a simple global minimum of all RTT
+	// measurements from the connection, or a windowed min-filtered value
+	// of recent RTT measurements. This implementation keeps track of the
+	// simple global minimum of all RTTs for the connection.
+	if rtt < rc.minRTT || rc.minRTT == 0 {
+		rc.minRTT = rtt
+	}
+
 	// Update rc.xmitTime and rc.endSequence to the transmit time and
 	// ending sequence number of the packet which has been acknowledged
 	// most recently.
@@ -80,3 +99,26 @@ func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration,
 		rc.endSequence = endSeq
 	}
 }
+
+// detectReorder detects if packet reordering has been observed.
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+// * Step 3: Detect data segment reordering.
+//   To detect reordering, the sender looks for original data segments being
+//   delivered out of order. To detect such cases, the sender tracks the
+//   highest sequence selectively or cumulatively acknowledged in the
+//   RACK.fack variable. The name "fack" stands for the most "Forward ACK"
+//   (this term is adopted from [FACK]). If a never-retransmitted segment
+//   that's below RACK.fack is (selectively or cumulatively) acknowledged,
+//   it has been delivered out of order. The sender sets RACK.reord to TRUE
+//   if such a segment is identified.
+func (rc *rackControl) detectReorder(seg *segment) {
+	endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
+	if rc.fack.LessThan(endSeq) {
+		rc.fack = endSeq
+		return
+	}
+
+	if endSeq.LessThan(rc.fack) && seg.xmitCount == 1 {
+		rc.reorderSeen = true
+	}
+}
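The fack/reorder rule above is compact enough to restate in isolation. The helper below is illustrative only: it replays detectReorder's comparisons outside the rackControl type, with arbitrary sequence numbers:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
)

// The reordering rule in miniature: track the highest acknowledged end
// sequence (fack); an ACK for a never-retransmitted segment that ends
// below fack means the segment was delivered out of order.
func main() {
	var fack seqnum.Value
	reorderSeen := false

	// ackedSegment feeds one (endSeq, xmitCount) pair per delivered segment.
	ackedSegment := func(endSeq seqnum.Value, xmitCount uint32) {
		if fack.LessThan(endSeq) {
			fack = endSeq // this ACK advances the forward-most point
			return
		}
		if endSeq.LessThan(fack) && xmitCount == 1 {
			reorderSeen = true // an older, never-retransmitted segment arrived late
		}
	}

	ackedSegment(3000, 1)          // bytes up to 3000 SACKed first
	ackedSegment(1500, 1)          // the earlier segment is acknowledged after it
	fmt.Println(fack, reorderSeen) // 3000 true
}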
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 5e0bfe585..8e0b7c843 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -43,26 +43,32 @@ type receiver struct {
 	// rcvWnd is the non-scaled receive window last advertised to the peer.
 	rcvWnd seqnum.Size

+	// rcvWUP is the rcvNxt value at the last window update sent.
+	rcvWUP seqnum.Value
+
 	rcvWndScale uint8

 	closed bool

+	// pendingRcvdSegments is bounded by the receive buffer size of the
+	// endpoint.
 	pendingRcvdSegments segmentHeap
-	pendingBufUsed      seqnum.Size
-	pendingBufSize      seqnum.Size
+	// pendingBufUsed tracks the total number of bytes (including segment
+	// overhead) currently queued in pendingRcvdSegments.
+	pendingBufUsed int

 	// Time when the last ack was received.
 	lastRcvdAckTime time.Time `state:".(unixTime)"`
 }

-func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
+func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8) *receiver {
 	return &receiver{
 		ep:              ep,
 		rcvNxt:          irs + 1,
 		rcvAcc:          irs.Add(rcvWnd + 1),
 		rcvWnd:          rcvWnd,
+		rcvWUP:          irs + 1,
 		rcvWndScale:     rcvWndScale,
-		pendingBufSize:  pendingBufSize,
 		lastRcvdAckTime: time.Now(),
 	}
 }
@@ -82,19 +88,54 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
 	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
 }

+// currentWindow returns the available space in the window that was advertised
+// last to our peer.
+func (r *receiver) currentWindow() (curWnd seqnum.Size) {
+	endOfWnd := r.rcvWUP.Add(r.rcvWnd)
+	if endOfWnd.LessThan(r.rcvNxt) {
+		// Return 0 if r.rcvNxt is past the end of the previously advertised
+		// window. This can happen because we accept a large segment
+		// completely even if accepting it causes it to partially exceed the
+		// advertised window.
+		return 0
+	}
+	return r.rcvNxt.Size(endOfWnd)
+}
+
 // getSendParams returns the parameters needed by the sender when building
 // segments to send.
 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
-	// Calculate the window size based on the available buffer space.
-	receiveBufferAvailable := r.ep.receiveBufferAvailable()
-	acc := r.rcvNxt.Add(seqnum.Size(receiveBufferAvailable))
-	if r.rcvAcc.LessThan(acc) {
-		r.rcvAcc = acc
+	newWnd := r.ep.selectWindow()
+	curWnd := r.currentWindow()
+	// Update rcvAcc only if the new window is larger than the previously
+	// advertised one. We should never shrink the acceptable sequence space
+	// once it has been advertised to the peer: if we did, we would end up
+	// dropping bytes that might already be in flight.
+	// ==================================================== sequence space.
+	//    ^             ^         ^                   ^
+	//  rcvWUP        rcvNxt    rcvAcc            new rcvAcc
+	//                  <======curWnd======>
+	//                  <========== newWnd > curWnd ==========>
+	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) {
+		// If the new window moves the right edge, then update rcvAcc.
+		r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd))
+	} else {
+		if newWnd == 0 {
+			// newWnd is zero, but we can't advertise a zero window as that
+			// would shrink the window; just increment a metric to record
+			// this event.
+			r.ep.stats.ReceiveErrors.WantZeroRcvWindow.Increment()
+		}
+		newWnd = curWnd
 	}
 	// Stash away the non-scaled receive window as we use it for measuring
 	// receiver's estimated RTT.
-	r.rcvWnd = r.rcvNxt.Size(r.rcvAcc)
-	return r.rcvNxt, r.rcvWnd >> r.rcvWndScale
+	r.rcvWnd = newWnd
+	r.rcvWUP = r.rcvNxt
+	scaledWnd := r.rcvWnd >> r.rcvWndScale
+	if scaledWnd == 0 {
+		// Increment a metric if we are advertising an actual zero window.
+		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
+	}
+	return r.rcvNxt, scaledWnd
 }

 // nonZeroWindow is called when the receive window grows from zero to nonzero;
@@ -195,7 +236,9 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		}

 		for i := first; i < len(r.pendingRcvdSegments); i++ {
+			r.pendingBufUsed -= r.pendingRcvdSegments[i].segMemSize()
 			r.pendingRcvdSegments[i].decRef()
+
 			// Note that slice truncation does not allow garbage collection of
 			// truncated items, thus truncated items must be set to nil to avoid
 			// memory leaks.
@@ -268,14 +311,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	// If we are in one of the shutdown states then we need to do
 	// additional checks before we try and process the segment.
 	switch state {
-	case StateCloseWait:
-		// If the ACK acks something not yet sent then we send an ACK.
-		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
-			r.ep.snd.sendAck()
-			return true, nil
-		}
-		fallthrough
-	case StateClosing, StateLastAck:
+	case StateCloseWait, StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
 			// Just drop the segment as we have
 			// already received a FIN and this
@@ -284,9 +320,31 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 			return true, nil
 		}
 		fallthrough
-	case StateFinWait1:
-		fallthrough
-	case StateFinWait2:
+	case StateFinWait1, StateFinWait2:
+		// If the ACK acks something not yet sent then we send an ACK.
+		//
+		// RFC793, page 37: If the connection is in a synchronized state,
+		// (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK,
+		// TIME-WAIT), any unacceptable segment (out of window sequence number
+		// or unacceptable acknowledgment number) must elicit only an empty
+		// acknowledgment segment containing the current send-sequence number
+		// and an acknowledgment indicating the next sequence number expected
+		// to be received, and the connection remains in the same state.
+		//
+		// Just as on Linux, we do not apply this behavior when the state is
+		// ESTABLISHED.
+		// Linux's receive processing for all states except ESTABLISHED and
+		// TIME_WAIT is here; if the ACK check fails, we reply with an ACK
+		// carrying the correct seq/ack numbers:
+		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L6186
+		// The ESTABLISHED state processing is here; if the ACK check fails,
+		// we ignore the packet:
+		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591
+		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
+			r.ep.snd.sendAck()
+			return true, nil
+		}
+
 		// If we are closed for reads (either due to an
 		// incoming FIN or the user calling shutdown(..,
 		// SHUT_RD) then any data past the rcvNxt should
@@ -369,10 +427,16 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 	// Defer segment processing if it can't be consumed now.
 	if !r.consumeSegment(s, segSeq, segLen) {
 		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
-			// We only store the segment if it's within our buffer
-			// size limit.
-			if r.pendingBufUsed < r.pendingBufSize {
-				r.pendingBufUsed += seqnum.Size(s.segMemSize())
+			// We only store the segment if it's within our buffer size limit.
+			//
+			// Reserve 75% of the receive buffer for in-order segments by
+			// letting out-of-order segments use at most 25% of it
+			// (receiveBufferSize()>>2). This ensures that we always leave
+			// space for in-order segments to arrive, allowing pending
+			// segments to be processed and delivered to the user.
+			if r.ep.receiveBufferAvailable() > 0 && r.pendingBufUsed < r.ep.receiveBufferSize()>>2 {
+				r.ep.rcvListMu.Lock()
+				r.pendingBufUsed += s.segMemSize()
+				r.ep.rcvListMu.Unlock()
 				s.incRef()
 				heap.Push(&r.pendingRcvdSegments, s)
 				UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt)
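With receiveBufferSize()>>2 as the bound, an out-of-order segment is admitted only while the pending bytes stay under a quarter of the receive buffer and some in-order space remains. A small illustration of that admission check with assumed buffer sizes (the real check lives in handleRcvdSegment above):

package main

import "fmt"

// admitOutOfOrder mirrors the condition above: queue an out-of-order
// segment only if in-order space remains and the pending (out-of-order)
// bytes are below a quarter of the receive buffer, keeping the bulk of
// the buffer free for in-order data.
func admitOutOfOrder(rcvBufAvailable, pendingBufUsed, rcvBufSize int) bool {
	return rcvBufAvailable > 0 && pendingBufUsed < rcvBufSize>>2
}

func main() {
	const rcvBufSize = 1 << 20 // assumed 1 MiB receive buffer
	fmt.Println(admitOutOfOrder(512<<10, 128<<10, rcvBufSize)) // true: under the 256 KiB cap
	fmt.Println(admitOutOfOrder(512<<10, 300<<10, rcvBufSize)) // false: pending bytes over the cap
	fmt.Println(admitOutOfOrder(0, 0, rcvBufSize))             // false: no in-order space left
}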
@@ -406,7 +470,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 		}
 		heap.Pop(&r.pendingRcvdSegments)
-		r.pendingBufUsed -= seqnum.Size(s.segMemSize())
+		r.ep.rcvListMu.Lock()
+		r.pendingBufUsed -= s.segMemSize()
+		r.ep.rcvListMu.Unlock()
 		s.decRef()
 	}
 	return false, nil
@@ -421,6 +487,13 @@ func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn
 	// Just silently drop any RST packets in TIME_WAIT. We do not support
 	// TIME_WAIT assassination, and as a result we confirm with fix 1 as
 	// described in https://tools.ietf.org/html/rfc1337#section-3.
+	//
+	// This behavior overrides RFC793 page 70, where we would transition to
+	// CLOSED on receiving a RST, which is also the default Linux behavior.
+	// On Linux the RST can be ignored by setting sysctl net.ipv4.tcp_rfc1337.
+	//
+	// As we do not yet support PAWS, we are being conservative in ignoring
+	// RSTs by default.
 	if s.flagIsSet(header.TCPFlagRst) {
 		return false, false
 	}
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
index 7ef2df377..833a7b470 100644
--- a/pkg/tcpip/transport/tcp/sack_scoreboard.go
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -164,7 +164,7 @@ func (s *SACKScoreboard) IsSACKED(r header.SACKBlock) bool {
 	return found
 }

-// Dump prints the state of the scoreboard structure.
+// String returns the human-readable state of the scoreboard structure.
 func (s *SACKScoreboard) String() string {
 	var str strings.Builder
 	str.WriteString("SACKScoreboard: {")
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index a20755f78..1f9c5cf50 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -15,6 +15,7 @@
 package tcp

 import (
+	"fmt"
 	"sync/atomic"
 	"time"

@@ -24,19 +25,29 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )

+// queueFlags are used to indicate which queue of an endpoint a particular
+// segment belongs to. This is used to track memory accounting correctly.
+type queueFlags uint8
+
+const (
+	recvQ queueFlags = 1 << iota
+	sendQ
+)
+
 // segment represents a TCP segment. It holds the payload and parsed TCP segment
 // information, and can be added to intrusive lists.
 // segment is mostly immutable, the only field allowed to change is viewToDeliver.
 //
 // +stateify savable
 type segment struct {
-	segEntry     segmentEntry
-	rackSegEntry rackSegmentEntry
-	refCnt       int32
-	id           stack.TransportEndpointID `state:"manual"`
-	route        stack.Route               `state:"manual"`
-	data         buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
-	hdr          header.TCP
+	segmentEntry
+	refCnt int32
+	ep     *endpoint
+	qFlags queueFlags
+	id     stack.TransportEndpointID `state:"manual"`
+	route  stack.Route               `state:"manual"`
+	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr    header.TCP
 	// views is used as buffer for data when its length is large
 	// enough to store a VectorisedView.
 	views [8]buffer.View `state:"nosave"`
@@ -60,17 +71,10 @@ type segment struct {
 	// xmitTime is the last transmit time of this segment.
 	xmitTime  time.Time `state:".(unixTime)"`
 	xmitCount uint32
-}
-
-// segmentMapper is the ElementMapper for the writeList.
-type segmentMapper struct{}
-
-func (segmentMapper) linkerFor(seg *segment) *segmentEntry { return &seg.segEntry }
-
-// rackSegmentMapper is the ElementMapper for the rcList.
-type rackSegmentMapper struct{}
-
-func (rackSegmentMapper) linkerFor(seg *segment) *rackSegmentEntry { return &seg.rackSegEntry }
+
+	// acked indicates if the segment has already been SACKed.
+	acked bool
+}

 func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
 	s := &segment{
@@ -111,6 +115,8 @@ func (s *segment) clone() *segment {
 		rcvdTime:  s.rcvdTime,
 		xmitTime:  s.xmitTime,
 		xmitCount: s.xmitCount,
+		ep:        s.ep,
+		qFlags:    s.qFlags,
 	}
 	t.data = s.data.Clone(t.views[:])
 	return t
@@ -126,8 +132,34 @@ func (s *segment) flagsAreSet(flags uint8) bool {
 	return s.flags&flags == flags
 }

+// setOwner sets the owning endpoint for this segment. It's required to be
+// called to ensure that memory accounting for the receive/send buffer
+// queues is done properly.
+func (s *segment) setOwner(ep *endpoint, qFlags queueFlags) {
+	switch qFlags {
+	case recvQ:
+		ep.updateReceiveMemUsed(s.segMemSize())
+	case sendQ:
+		// No memory accounting for sendQ yet.
+	default:
+		panic(fmt.Sprintf("unexpected queue flag %b", qFlags))
+	}
+	s.ep = ep
+	s.qFlags = qFlags
+}
+
 func (s *segment) decRef() {
 	if atomic.AddInt32(&s.refCnt, -1) == 0 {
+		if s.ep != nil {
+			switch s.qFlags {
+			case recvQ:
+				s.ep.updateReceiveMemUsed(-s.segMemSize())
+			case sendQ:
+				// No memory accounting for sendQ yet.
+			default:
+				panic(fmt.Sprintf("unexpected queue flag %b set for segment", s.qFlags))
+			}
+		}
 		s.route.Release()
 	}
 }
@@ -149,6 +181,11 @@ func (s *segment) logicalLen() seqnum.Size {
 	return l
 }

+// payloadSize is the size of s.data.
+func (s *segment) payloadSize() int {
+	return s.data.Size()
+}
+
 // segMemSize is the amount of memory used to hold the segment data and
 // the associated metadata.
 func (s *segment) segMemSize() int {
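The charge/release pairing above (setOwner charging on enqueue, the final decRef releasing) is the invariant the accounting depends on: memory is charged exactly once per segment and released exactly once, however many references are taken in between. A simplified sketch with stand-in types, not the real segment or endpoint:

package main

import (
	"fmt"
	"sync/atomic"
)

// toySegment models the pairing: setOwner charges the endpoint's receive
// accounting once, and the final decRef releases it.
type toySegment struct {
	refCnt  int32
	size    int
	release func(int) // set by setOwner; stands in for updateReceiveMemUsed
}

func (s *toySegment) incRef() { atomic.AddInt32(&s.refCnt, 1) }

func (s *toySegment) setOwner(updateMemUsed func(int)) {
	updateMemUsed(s.size) // charge on enqueue (the recvQ case above)
	s.release = updateMemUsed
}

func (s *toySegment) decRef() {
	if atomic.AddInt32(&s.refCnt, -1) == 0 && s.release != nil {
		s.release(-s.size) // release exactly once, on the last reference
	}
}

func main() {
	var used int
	seg := &toySegment{refCnt: 1, size: 1500}
	seg.setOwner(func(d int) { used += d })
	seg.incRef() // e.g. also held by the pending heap
	seg.decRef()
	fmt.Println(used) // 1500: still referenced, charge retained
	seg.decRef()
	fmt.Println(used) // 0: final decRef released the charge
}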
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index 48a257137..54545a1b1 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -22,16 +22,16 @@ import (
 //
 // +stateify savable
 type segmentQueue struct {
-	mu    sync.Mutex  `state:"nosave"`
-	list  segmentList `state:"wait"`
-	limit int
-	used  int
+	mu     sync.Mutex  `state:"nosave"`
+	list   segmentList `state:"wait"`
+	ep     *endpoint
+	frozen bool
 }

 // emptyLocked determines if the queue is empty.
 // Preconditions: q.mu must be held.
 func (q *segmentQueue) emptyLocked() bool {
-	return q.used == 0
+	return q.list.Empty()
 }

 // empty determines if the queue is empty.
@@ -43,14 +43,6 @@ func (q *segmentQueue) empty() bool {
 	return r
 }

-// setLimit updates the limit. No segments are immediately dropped in case the
-// queue becomes full due to the new limit.
-func (q *segmentQueue) setLimit(limit int) {
-	q.mu.Lock()
-	q.limit = limit
-	q.mu.Unlock()
-}
-
 // enqueue adds the given segment to the queue.
 //
 // Returns true when the segment is successfully added to the queue, in which
@@ -58,15 +50,23 @@ func (q *segmentQueue) setLimit(limit int) {
 // false if the queue is full, in which case ownership is retained by the
 // caller.
 func (q *segmentQueue) enqueue(s *segment) bool {
+	// q.ep.receiveBufferSize() and q.ep.receiveMemUsed() must be called
+	// without holding q.mu to avoid lock order inversion.
+	bufSz := q.ep.receiveBufferSize()
+	used := q.ep.receiveMemUsed()
 	q.mu.Lock()
-	r := q.used < q.limit
-	if r {
+	// Allow zero-sized segments (ACKs/FINs/RSTs etc.) even if the segment
+	// queue is currently full.
+	allow := (used <= bufSz || s.payloadSize() == 0) && !q.frozen
+
+	if allow {
 		q.list.PushBack(s)
-		q.used++
+		// Set the owner now that the endpoint owns the segment.
+		s.setOwner(q.ep, recvQ)
 	}
 	q.mu.Unlock()

-	return r
+	return allow
 }

 // dequeue removes and returns the next segment from queue, if one exists.
@@ -77,9 +77,25 @@ func (q *segmentQueue) dequeue() *segment {
 	s := q.list.Front()
 	if s != nil {
 		q.list.Remove(s)
-		q.used--
 	}
 	q.mu.Unlock()

 	return s
 }
+
+// freeze prevents any more segments from being added to the queue, i.e. all
+// future calls to segmentQueue.enqueue will return false and not add the
+// segment to the queue, until the queue is unfrozen by a corresponding
+// segmentQueue.thaw call.
+func (q *segmentQueue) freeze() {
+	q.mu.Lock()
+	q.frozen = true
+	q.mu.Unlock()
+}
+
+// thaw unfreezes a queue previously frozen with segmentQueue.freeze and
+// allows new segments to be queued again.
+func (q *segmentQueue) thaw() {
+	q.mu.Lock()
+	q.frozen = false
+	q.mu.Unlock()
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 31151f23d..6fa8d63cd 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -17,6 +17,7 @@ package tcp
 import (
 	"fmt"
 	"math"
+	"sort"
 	"sync/atomic"
 	"time"

@@ -154,7 +155,6 @@ type sender struct {
 	closed    bool
 	writeNext *segment
 	writeList segmentList
-	rcList    rackSegmentList
 	resendTimer timer       `state:"nosave"`
 	resendWaker sleep.Waker `state:"nosave"`
@@ -264,6 +264,9 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 			highRxt:   iss,
 			rescueRxt: iss,
 		},
+		rc: rackControl{
+			fack: iss,
+		},
 		gso: ep.gso != nil,
 	}
@@ -368,7 +371,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
 	// if it is already before such a packet.
-	for seg := s.writeList.Front(); seg != nil; seg = seg.segEntry.Next() {
+	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
 		if seg == s.writeNext {
 			// We got to writeNext before we could find a segment
 			// exceeding the MTU.
@@ -623,7 +626,6 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
-	s.rcList.InsertAfter(seg, nSeg)

 	// The segment being split does not carry PUSH flag because it is
 	// followed by the newly split segment.
@@ -655,7 +657,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	var s3 *segment
 	var s4 *segment
 	// Step 1.
-	for seg := nextSegHint; seg != nil; seg = seg.segEntry.Next() {
+	for seg := nextSegHint; seg != nil; seg = seg.Next() {
 		// Stop iteration if we hit a segment that has never been
 		// transmitted (i.e. either it has no assigned sequence number
 		// or if it does have one, it's >= the next sequence number
@@ -685,7 +687,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 		// NextSeg():
 		//     (1.c) IsLost(S2) returns true.
 		if s.ep.scoreboard.IsLost(segSeq) {
-			return seg, seg.segEntry.Next(), false
+			return seg, seg.Next(), false
 		}

 		// NextSeg():
@@ -699,7 +701,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 		//     SHOULD be returned.
if s3 == nil { s3 = seg - hint = seg.segEntry.Next() + hint = seg.Next() } } // NextSeg(): @@ -733,7 +735,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt // range of one segment of up to SMSS octets of // previously unsent data starting with sequence number // HighData+1 MUST be returned." - for seg := s.writeNext; seg != nil; seg = seg.segEntry.Next() { + for seg := s.writeNext; seg != nil; seg = seg.Next() { if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { continue } @@ -775,16 +777,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // triggering bugs in poorly written DNS // implementations. var nextTooBig bool - for seg.segEntry.Next() != nil && seg.segEntry.Next().data.Size() != 0 { - if seg.data.Size()+seg.segEntry.Next().data.Size() > available { + for seg.Next() != nil && seg.Next().data.Size() != 0 { + if seg.data.Size()+seg.Next().data.Size() > available { nextTooBig = true break } - seg.data.Append(seg.segEntry.Next().data) + seg.data.Append(seg.Next().data) // Consume the segment that we just merged in. - s.writeList.Remove(seg.segEntry.Next()) - s.rcList.Remove(seg.rackSegEntry.Next()) + s.writeList.Remove(seg.Next()) } if !nextTooBig && seg.data.Size() < available { // Segment is not full. @@ -951,7 +952,7 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) } dataSent = true s.outstanding++ - s.writeNext = nextSeg.segEntry.Next() + s.writeNext = nextSeg.Next() continue } @@ -964,7 +965,6 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) // transmitted in (C.1)." s.outstanding++ dataSent = true - s.sendSegment(nextSeg) segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) @@ -1039,7 +1039,7 @@ func (s *sender) sendData() { if s.fr.active && s.ep.sackPermitted { dataSent = s.handleSACKRecovery(s.maxPayloadSize, end) } else { - for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.segEntry.Next() { + for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize if cwndLimit < limit { limit = cwndLimit @@ -1047,7 +1047,7 @@ func (s *sender) sendData() { if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { // Move writeNext along so that we don't try and scan data that // has already been SACKED. - s.writeNext = seg.segEntry.Next() + s.writeNext = seg.Next() continue } if sent := s.maybeSendSegment(seg, limit, end); !sent { @@ -1055,7 +1055,7 @@ func (s *sender) sendData() { } dataSent = true s.outstanding += s.pCount(seg) - s.writeNext = seg.segEntry.Next() + s.writeNext = seg.Next() } } @@ -1186,7 +1186,7 @@ func (s *sender) SetPipe() { } pipe := 0 smss := seqnum.Size(s.ep.scoreboard.SMSS()) - for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.segEntry.Next() { + for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { // With GSO each segment can be much larger than SMSS. So check the segment // in SMSS sized ranges. segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) @@ -1278,6 +1278,39 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { return true } +// Iterate the writeList and update RACK for each segment which is newly acked +// either cumulatively or selectively. 
Loop through the segments which are +// sacked, and update the RACK related variables and check for reordering. +// +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 +// steps 2 and 3. +func (s *sender) walkSACK(rcvdSeg *segment) { + // Sort the SACK blocks. The first block is the most recent unacked + // block. The following blocks can be in arbitrary order. + sackBlocks := make([]header.SACKBlock, len(rcvdSeg.parsedOptions.SACKBlocks)) + copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks) + sort.Slice(sackBlocks, func(i, j int) bool { + return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) + }) + + seg := s.writeList.Front() + for _, sb := range sackBlocks { + // This check excludes DSACK blocks. + if sb.Start.LessThanEq(rcvdSeg.ackNumber) || sb.Start.LessThanEq(s.sndUna) || s.sndNxt.LessThan(sb.End) { + continue + } + + for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { + if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { + s.rc.update(seg, rcvdSeg, s.ep.tsOffset) + s.rc.detectReorder(seg) + seg.acked = true + } + seg = seg.Next() + } + } +} + // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. func (s *sender) handleRcvdSegment(rcvdSeg *segment) { @@ -1312,6 +1345,21 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { rcvdSeg.hasNewSACKInfo = true } } + + // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08 + // section-7.2 + // * Step 2: Update RACK stats. + // If the ACK is not ignored as invalid, update the RACK.rtt + // to be the RTT sample calculated using this ACK, and + // continue. If this ACK or SACK was for the most recently + // sent packet, then record the RACK.xmit_ts timestamp and + // RACK.end_seq sequence implied by this ACK. + // * Step 3: Detect packet reordering. + // If the ACK selectively or cumulatively acknowledges an + // unacknowledged and also never retransmitted sequence below + // RACK.fack, then the corresponding packet has been + // reordered and RACK.reord is set to TRUE. + s.walkSACK(rcvdSeg) s.SetPipe() } @@ -1369,9 +1417,6 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { ackLeft := acked originalOutstanding := s.outstanding - s.rtt.Lock() - srtt := s.rtt.srtt - s.rtt.Unlock() for ackLeft > 0 { // We use logicalLen here because we can have FIN // segments (which are always at the end of list) that @@ -1388,18 +1433,18 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { } if s.writeNext == seg { - s.writeNext = seg.segEntry.Next() + s.writeNext = seg.Next() } // Update the RACK fields if SACK is enabled. - if s.ep.sackPermitted { - s.rc.Update(seg, rcvdSeg, srtt, s.ep.tsOffset) + if s.ep.sackPermitted && !seg.acked { + s.rc.update(seg, rcvdSeg, s.ep.tsOffset) + s.rc.detectReorder(seg) } s.writeList.Remove(seg) - s.rcList.Remove(seg) - // if SACK is enabled then Only reduce outstanding if + // If SACK is enabled then Only reduce outstanding if // the segment was not previously SACKED as these have // already been accounted for in SetPipe(). if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { @@ -1465,12 +1510,6 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error { if s.sndCwnd < s.sndSsthresh { s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() } - - // Move the segment which has to be retransmitted to the end of the list, as - // RACK requires the segments in the order of their transmission times. 
- // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 - // Step 5 - s.rcList.PushBack(seg) } seg.xmitTime = time.Now() seg.xmitCount++ diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go index e03f101e8..d3f92b48c 100644 --- a/pkg/tcpip/transport/tcp/tcp_rack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -21,17 +21,20 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context" ) +const ( + maxPayload = 10 + tsOptionSize = 12 + maxTCPOptionSize = 40 +) + // TestRACKUpdate tests the RACK related fields are updated when an ACK is // received on a SACK enabled connection. func TestRACKUpdate(t *testing.T) { - const maxPayload = 10 - const tsOptionSize = 12 - const maxTCPOptionSize = 40 - c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload)) defer c.Cleanup() @@ -49,7 +52,7 @@ func TestRACKUpdate(t *testing.T) { } if state.Sender.RACKState.RTT == 0 { - t.Fatalf("RACK RTT failed to update when an ACK is received") + t.Fatalf("RACK RTT failed to update when an ACK is received, got RACKState.RTT == 0 want != 0") } }) setStackSACKPermitted(t, c, true) @@ -69,6 +72,66 @@ func TestRACKUpdate(t *testing.T) { bytesRead := 0 c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize) bytesRead += maxPayload - c.SendAck(790, bytesRead) + c.SendAck(seqnum.Value(context.TestInitialSequenceNumber).Add(1), bytesRead) time.Sleep(200 * time.Millisecond) } + +// TestRACKDetectReorder tests that RACK detects packet reordering. +func TestRACKDetectReorder(t *testing.T) { + c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload)) + defer c.Cleanup() + + const ackNum = 2 + + var n int + ch := make(chan struct{}) + c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) { + gotSeq := state.Sender.RACKState.FACK + wantSeq := state.Sender.SndNxt + // FACK should be updated to the highest ending sequence number of the + // segment acknowledged most recently. + if !gotSeq.LessThanEq(wantSeq) || gotSeq.LessThan(wantSeq) { + t.Fatalf("RACK FACK failed to update, got: %v, but want: %v", gotSeq, wantSeq) + } + + n++ + if n < ackNum { + if state.Sender.RACKState.Reord { + t.Fatalf("RACK reorder detected when there is no reordering") + } + return + } + + if state.Sender.RACKState.Reord == false { + t.Fatalf("RACK reorder detection failed") + } + close(ch) + }) + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + data := buffer.NewView(ackNum * maxPayload) + for i := range data { + data[i] = byte(i) + } + + // Write the data. + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + bytesRead := 0 + for i := 0; i < ackNum; i++ { + c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize) + bytesRead += maxPayload + } + + start := c.IRS.Add(maxPayload + 1) + end := start.Add(maxPayload) + seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) + c.SendAckWithSACK(seq, 0, []header.SACKBlock{{start, end}}) + c.SendAck(seq, bytesRead) + + // Wait for the probe function to finish processing the ACK before the + // test completes. 
+ <-ch +} diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 99521f0c1..ef7f5719f 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -46,8 +46,9 @@ func createConnectedWithSACKAndTS(c *context.Context) *context.RawEndpoint { func setStackSACKPermitted(t *testing.T, c *context.Context, enable bool) { t.Helper() - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enable)); err != nil { - t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, SACKEnabled(%t) = %s", enable, err) + opt := tcpip.TCPSACKEnabled(enable) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("c.s.SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } @@ -162,8 +163,9 @@ func TestSackPermittedAccept(t *testing.T) { // Set the SynRcvd threshold to // zero to force a syn cookie // based accept to happen. - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err) + var opt tcpip.TCPSynRcvdCountThresholdOption + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } } setStackSACKPermitted(t, c, sackEnabled) @@ -236,8 +238,9 @@ func TestSackDisabledAccept(t *testing.T) { // Set the SynRcvd threshold to // zero to force a syn cookie // based accept to happen. - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err) + var opt tcpip.TCPSynRcvdCountThresholdOption + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 55ae09a2f..a7149efd0 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -21,6 +21,7 @@ import ( "testing" "time" + "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -74,8 +75,8 @@ func TestGiveUpConnect(t *testing.T) { // Wait for ep to become writable. <-notifyCh - if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted { - t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %s, want = %s", err, tcpip.ErrAborted) + if err := ep.LastError(); err != tcpip.ErrAborted { + t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted) } // Call Connect again to retreive the handshake failure status @@ -240,6 +241,38 @@ func TestTCPResetsSentIncrement(t *testing.T) { } } +// TestTCPResetsSentNoICMP confirms that we don't get an ICMP +// DstUnreachable packet when we try send a packet which is not part +// of an active session. +func TestTCPResetsSentNoICMP(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + stats := c.Stack().Stats() + + // Send a SYN request for a closed port. This should elicit an RST + // but NOT an ICMPv4 DstUnreachable packet. 
+ iss := seqnum.Value(789) + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: iss, + }) + + // Receive whatever comes back. + b := c.GetPacket() + ipHdr := header.IPv4(b) + if got, want := ipHdr.Protocol(), uint8(header.TCPProtocolNumber); got != want { + t.Errorf("unexpected protocol, got = %d, want = %d", got, want) + } + + // Read outgoing ICMP stats and check no ICMP DstUnreachable was recorded. + sent := stats.ICMP.V4PacketsSent + if got, want := sent.DstUnreachable.Value(), uint64(0); got != want { + t.Errorf("got ICMP DstUnreachable.Value() = %d, want = %d", got, want) + } +} + // TestTCPResetSentForACKWhenNotUsingSynCookies checks that the stack generates // a RST if an ACK is received on the listening socket for which there is no // active handshake in progress and we are not using SYN cookies. @@ -291,12 +324,12 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -309,16 +342,16 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) { // Lower stackwide TIME_WAIT timeout so that the reservations // are released instantly on Close. tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond) - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil { - t.Fatalf("e.stack.SetTransportProtocolOption(%d, %#v) = %s", tcp.ProtocolNumber, tcpTW, err) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &tcpTW); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, tcpTW, tcpTW, err) } c.EP.Close() checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(iss)+1), checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) finHeaders := &context.Headers{ SrcPort: context.TestPort, @@ -348,8 +381,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(0), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(0), checker.TCPFlags(header.TCPFlagRst))) } @@ -432,8 +465,9 @@ func TestConnectResetAfterClose(t *testing.T) { // Set TCPLinger to 3 seconds so that sockets are marked closed // after 3 second in FIN_WAIT2 state. 
tcpLingerTimeout := 3 * time.Second - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%s) failed: %s", tcpLingerTimeout, err) + opt := tcpip.TCPLingerTimeoutOption(tcpLingerTimeout) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } c.CreateConnected(789, 30000, -1 /* epRcvBuf */) @@ -446,8 +480,8 @@ func TestConnectResetAfterClose(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -488,8 +522,8 @@ func TestConnectResetAfterClose(t *testing.T) { // RST is always generated with sndNxt which if the FIN // has been sent will be 1 higher than the sequence number // of the FIN itself. - checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(0), + checker.TCPSeqNum(uint32(c.IRS)+2), + checker.TCPAckNum(0), checker.TCPFlags(header.TCPFlagRst), ), ) @@ -506,8 +540,9 @@ func TestCurrentConnectedIncrement(t *testing.T) { // Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed // after 1 second in TIME_WAIT state. tcpTimeWaitTimeout := 1 * time.Second - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err) + opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } c.CreateConnected(789, 30000, -1 /* epRcvBuf */) @@ -527,8 +562,8 @@ func TestCurrentConnectedIncrement(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -563,8 +598,8 @@ func TestCurrentConnectedIncrement(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(791), + checker.TCPSeqNum(uint32(c.IRS)+2), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -610,8 +645,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(791), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -631,8 +666,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(791), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -691,8 +726,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(0), 
+ checker.TCPSeqNum(uint32(c.IRS)+2), + checker.TCPAckNum(0), checker.TCPFlags(header.TCPFlagRst), ), ) @@ -743,8 +778,8 @@ func TestSimpleReceive(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+len(data))), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -933,8 +968,8 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) { // Set the SynRcvd threshold to force a syn cookie based accept to happen. opt := tcpip.TCPSynRcvdCountThresholdOption(nonSynCookieAccepts) - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, %#v): %s", tcp.ProtocolNumber, opt, err) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil { @@ -996,7 +1031,7 @@ func TestSendRstOnListenerRxSynAckV4(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagRst), - checker.SeqNum(200))) + checker.TCPSeqNum(200))) } func TestSendRstOnListenerRxSynAckV6(t *testing.T) { @@ -1024,7 +1059,7 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) { checker.IPv6(t, c.GetV6Packet(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagRst), - checker.SeqNum(200))) + checker.TCPSeqNum(200))) } // TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete, @@ -1061,8 +1096,8 @@ func TestTCPAckBeforeAcceptV4(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(iss+1)), - checker.AckNum(uint32(irs+5)))) + checker.TCPSeqNum(uint32(iss+1)), + checker.TCPAckNum(uint32(irs+5)))) } // TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete, @@ -1099,8 +1134,8 @@ func TestTCPAckBeforeAcceptV6(t *testing.T) { checker.IPv6(t, c.GetV6Packet(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(iss+1)), - checker.AckNum(uint32(irs+5)))) + checker.TCPSeqNum(uint32(iss+1)), + checker.TCPAckNum(uint32(irs+5)))) } func TestSendRstOnListenerRxAckV4(t *testing.T) { @@ -1128,7 +1163,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagRst), - checker.SeqNum(200))) + checker.TCPSeqNum(200))) } func TestSendRstOnListenerRxAckV6(t *testing.T) { @@ -1156,7 +1191,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) { checker.IPv6(t, c.GetV6Packet(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagRst), - checker.SeqNum(200))) + checker.TCPSeqNum(200))) } // TestListenShutdown tests for the listening endpoint replying with RST @@ -1272,8 +1307,8 @@ func TestTOSV4(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), // Acknum is initial sequence number + 1 + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), // Acknum is initial sequence number + 1 checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), checker.TOS(tos, 0), @@ 
-1321,8 +1356,8 @@ func TestTrafficClassV6(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), checker.TOS(tos, 0), @@ -1349,7 +1384,9 @@ func TestConnectBindToDevice(t *testing.T) { c.Create(-1) bindToDevice := tcpip.BindToDeviceOption(test.device) - c.EP.SetSockOpt(bindToDevice) + if err := c.EP.SetSockOpt(&bindToDevice); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%d)): %s", bindToDevice, bindToDevice, err) + } // Start connection attempt. waitEntry, _ := waiter.NewChannelEntry(nil) c.WQ.EventRegister(&waitEntry, waiter.EventOut) @@ -1510,8 +1547,8 @@ func TestOutOfOrderReceive(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1561,8 +1598,8 @@ func TestOutOfOrderReceive(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+len(data))), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1572,8 +1609,8 @@ func TestOutOfOrderFlood(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - // Create a new connection with initial window size of 10. - c.CreateConnected(789, 30000, 10) + rcvBufSz := math.MaxUint16 + c.CreateConnected(789, 30000, rcvBufSz) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) @@ -1594,8 +1631,8 @@ func TestOutOfOrderFlood(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1615,8 +1652,8 @@ func TestOutOfOrderFlood(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1635,8 +1672,8 @@ func TestOutOfOrderFlood(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(793), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(793), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1677,8 +1714,8 @@ func TestRstOnCloseWithUnreadData(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+len(data))), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1692,7 +1729,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) { checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst), // We shouldn't consume a sequence number on RST. - checker.SeqNum(uint32(c.IRS)+1), + checker.TCPSeqNum(uint32(c.IRS)+1), )) // The RST puts the endpoint into an error state. 
if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { @@ -1746,8 +1783,8 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+len(data))), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -1760,7 +1797,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) { checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), - checker.SeqNum(uint32(c.IRS)+1), + checker.TCPSeqNum(uint32(c.IRS)+1), )) if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want { @@ -1779,7 +1816,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) { // RST is always generated with sndNxt which if the FIN // has been sent will be 1 higher than the sequence // number of the FIN itself. - checker.SeqNum(uint32(c.IRS)+2), + checker.TCPSeqNum(uint32(c.IRS)+2), )) // The RST puts the endpoint into an error state. if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { @@ -1825,7 +1862,8 @@ func TestFullWindowReceive(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - c.CreateConnected(789, 30000, 10) + const rcvBufSz = 10 + c.CreateConnected(789, 30000, rcvBufSz) we, ch := waiter.NewChannelEntry(nil) c.WQ.EventRegister(&we, waiter.EventIn) @@ -1836,8 +1874,13 @@ func TestFullWindowReceive(t *testing.T) { t.Fatalf("Read failed: %s", err) } - // Fill up the window. - data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + // Fill up the window w/ tcp.SegOverheadFactor*rcvBufSz as netstack multiplies + // the provided buffer value by tcp.SegOverheadFactor to calculate the actual + // receive buffer size. + data := make([]byte, tcp.SegOverheadFactor*rcvBufSz) + for i := range data { + data[i] = byte(i % 255) + } c.SendPacket(data, &context.Headers{ SrcPort: context.TestPort, DstPort: c.Port, @@ -1858,10 +1901,10 @@ func TestFullWindowReceive(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+len(data))), checker.TCPFlags(header.TCPFlagAck), - checker.Window(0), + checker.TCPWindow(0), ), ) @@ -1884,10 +1927,10 @@ func TestFullWindowReceive(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+len(data))), checker.TCPFlags(header.TCPFlagAck), - checker.Window(10), + checker.TCPWindow(10), ), ) } @@ -1896,12 +1939,15 @@ func TestNoWindowShrinking(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - // Start off with a window size of 10, then shrink it to 5. - c.CreateConnected(789, 30000, 10) - - if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil { - t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err) - } + // Start off with a certain receive buffer then cut it in half and verify that + // the right edge of the window does not shrink. + // NOTE: Netstack doubles the value specified here. + rcvBufSize := 65536 + iss := seqnum.Value(789) + // Enable window scaling with a scale of zero from our end. 
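The raw options passed to CreateConnectedWithRawOptions just below hand-encode the window scale option; its layout, for reference (the byte values match the standard TCP option kinds, header.TCPOptionWS = 3 and header.TCPOptionNOP = 1):

wsOpt := []byte{
	header.TCPOptionWS,  // kind: window scale
	3,                   // option length, including the kind and length bytes
	0,                   // shift count: the peer advertises a scale of zero
	header.TCPOptionNOP, // one byte of padding to a 4-byte boundary
}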
+ c.CreateConnectedWithRawOptions(iss, 30000, rcvBufSize, []byte{ + header.TCPOptionWS, 3, 0, header.TCPOptionNOP, + }) we, ch := waiter.NewChannelEntry(nil) c.WQ.EventRegister(&we, waiter.EventIn) @@ -1910,14 +1956,15 @@ func TestNoWindowShrinking(t *testing.T) { if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } - - // Send 3 bytes, check that the peer acknowledges them. - data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - c.SendPacket(data[:3], &context.Headers{ + // Send a 1 byte payload so that we can record the current receive window. + seqNum := iss.Add(1) + payload := []byte{1} + c.SendPacket(payload, &context.Headers{ SrcPort: context.TestPort, DstPort: c.Port, Flags: header.TCPFlagAck, - SeqNum: 790, + SeqNum: seqNum, AckNum: c.IRS.Add(1), RcvWnd: 30000, }) @@ -1929,46 +1976,93 @@ func TestNoWindowShrinking(t *testing.T) { t.Fatalf("Timed out waiting for data to arrive") } - // Check that data is acknowledged, and that window doesn't go to zero - // just yet because it was previously set to 10. It must go to 7 now. - checker.IPv4(t, c.GetPacket(), + // Read the 1 byte payload we just sent. + v, _, err := c.EP.Read(nil) + if err != nil { + t.Fatalf("Read failed: %s", err) + } + if got, want := v, payload; !bytes.Equal(got, want) { + t.Fatalf("got data: %v, want: %v", got, want) + } + + seqNum = seqNum.Add(1) + // Verify that the ACK does not shrink the window. + pkt := c.GetPacket() + checker.IPv4(t, pkt, checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(793), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(seqNum)), checker.TCPFlags(header.TCPFlagAck), - checker.Window(7), ), ) + // Stash the initial window. + initialWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale + initialLastAcceptableSeq := seqNum.Add(seqnum.Size(initialWnd)) + // Now shrink the receive buffer to half its original size. + if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufSize/2); err != nil { + t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", rcvBufSize/2, err) + } - // Send 7 more bytes, check that the window fills up. - c.SendPacket(data[3:], &context.Headers{ + data := generateRandomPayload(t, rcvBufSize) + // Send a payload of half the size of rcvBufSize. + c.SendPacket(data[:rcvBufSize/2], &context.Headers{ SrcPort: context.TestPort, DstPort: c.Port, Flags: header.TCPFlagAck, - SeqNum: 793, + SeqNum: seqNum, AckNum: c.IRS.Add(1), RcvWnd: 30000, }) + seqNum = seqNum.Add(seqnum.Size(rcvBufSize / 2)) - select { - case <-ch: - case <-time.After(5 * time.Second): - t.Fatalf("Timed out waiting for data to arrive") + // Verify that the ACK does not shrink the window. + pkt = c.GetPacket() + checker.IPv4(t, pkt, + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(seqNum)), + checker.TCPFlags(header.TCPFlagAck), + ), + ) + newWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale + newLastAcceptableSeq := seqNum.Add(seqnum.Size(newWnd)) + if newLastAcceptableSeq.LessThan(initialLastAcceptableSeq) { + t.Fatalf("receive window shrunk unexpectedly, got: %d, want >= %d", newLastAcceptableSeq, initialLastAcceptableSeq) } + // Send another payload of half the size of rcvBufSize. This should fill up the + // socket receive buffer and we should see a zero window.
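The LessThan check just above encodes the RFC 1122 rule this test is named for: a receiver may hold the right edge of the window in place, but must never move it left, even after its receive buffer shrinks. The right edge computed in the test is simply:

// The last acceptable sequence number implied by an ACK: the acknowledged
// sequence number plus the advertised window (already shifted left by the
// receive window scale, as done above).
rightEdge := func(ack seqnum.Value, scaledWnd uint32) seqnum.Value {
	return ack.Add(seqnum.Size(scaledWnd))
}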
+ c.SendPacket(data[rcvBufSize/2:], &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: seqNum, + AckNum: c.IRS.Add(1), + RcvWnd: 30000, + }) + seqNum = seqNum.Add(seqnum.Size(rcvBufSize / 2)) + checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(seqNum)), checker.TCPFlags(header.TCPFlagAck), - checker.Window(0), + checker.TCPWindow(0), ), ) + // Wait for receive to be notified. + select { + case <-ch: + case <-time.After(5 * time.Second): + t.Fatalf("Timed out waiting for data to arrive") + } + // Receive data and check it. - read := make([]byte, 0, 10) + read := make([]byte, 0, rcvBufSize) for len(read) < len(data) { v, _, err := c.EP.Read(nil) if err != nil { @@ -1982,15 +2076,15 @@ func TestNoWindowShrinking(t *testing.T) { t.Fatalf("got data = %v, want = %v", read, data) } - // Check that we get an ACK for the newly non-zero window, which is the - // new size. + // Check that we get an ACK for the newly non-zero window, which is the new + // receive buffer size we set after the connection was established. checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(seqNum)), checker.TCPFlags(header.TCPFlagAck), - checker.Window(5), + checker.TCPWindow(uint16(rcvBufSize/2)>>c.RcvdWindowScale), ), ) } @@ -2015,8 +2109,8 @@ func TestSimpleSend(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2057,8 +2151,8 @@ func TestZeroWindowSend(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2079,8 +2173,8 @@ func TestZeroWindowSend(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2119,16 +2213,16 @@ func TestScaledWindowConnect(t *testing.T) { t.Fatalf("Write failed: %s", err) } - // Check that data is received, and that advertised window is 0xbfff, + // Check that data is received, and that advertised window is 0x5fff, // that is, that it is scaled. 
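Where 0x5fff comes from: the 0xbfff to 0x5fff change here and in TestScaledWindowAccept below is consistent with netstack now doubling the requested receive buffer and advertising half of the result, scaled down by the window shift. Assuming the receive buffer requested at connection setup is 65535*3 (that call sits outside this hunk) and a window scale of 3, the arithmetic works out as:

// requested:  65535 * 3         = 196605
// doubled:    196605 * 2        = 393210  (tcp.SegOverheadFactor)
// advertised: (393210 / 2) >> 3 = 24575   = 0x5fff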
b := c.GetPacket() checker.IPv4(t, b, checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.Window(0xbfff), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), + checker.TCPWindow(0x5fff), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2158,9 +2252,9 @@ func TestNonScaledWindowConnect(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.Window(0xffff), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), + checker.TCPWindow(0xffff), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2194,19 +2288,20 @@ func TestScaledWindowAccept(t *testing.T) { } // Do 3-way handshake. - c.PassiveConnectWithOptions(100, 2, header.TCPSynOptions{MSS: defaultIPv4MSS}) + // wndScale expected is 3 as 65535 * 3 * 2 < 65535 * 2^3 but > 65535 *2 *2 + c.PassiveConnectWithOptions(100, 3 /* wndScale */, header.TCPSynOptions{MSS: defaultIPv4MSS}) // Try to accept the connection. we, ch := waiter.NewChannelEntry(nil) wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -2224,16 +2319,16 @@ func TestScaledWindowAccept(t *testing.T) { t.Fatalf("Write failed: %s", err) } - // Check that data is received, and that advertised window is 0xbfff, + // Check that data is received, and that advertised window is 0x5fff, // that is, that it is scaled. b := c.GetPacket() checker.IPv4(t, b, checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.Window(0xbfff), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), + checker.TCPWindow(0x5fff), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2275,12 +2370,12 @@ func TestNonScaledWindowAccept(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -2305,9 +2400,9 @@ func TestNonScaledWindowAccept(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.Window(0xffff), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), + checker.TCPWindow(0xffff), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2320,18 +2415,19 @@ func TestZeroScaledWindowReceive(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - // Set the window size such that a window scale of 4 will be used. - const wnd = 65535 * 10 - const ws = uint32(4) - c.CreateConnectedWithRawOptions(789, 30000, wnd, []byte{ + // Set the buffer size such that a window scale of 5 will be used. 
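The window scale expectations updated in this diff (3 for the 65535*3 buffer above, 5 for the 65535*10 buffer in the constants that follow, 3 for the 0x20000 buffer in TestSynOptionsOnActiveConnect) all follow one rule: the smallest shift that lets the doubled buffer fit the 16-bit window field. A hedged reconstruction, with an illustrative helper name rather than the stack's internal API:

func wndScaleFor(rcvBuf int) uint8 {
	need := rcvBuf * 2 // netstack doubles the requested buffer (SegOverheadFactor).
	var ws uint8
	for ws < 14 && need > 0xffff<<ws { // 14 is the maximum scale allowed by RFC 1323.
		ws++
	}
	return ws
}

For example, wndScaleFor(65535*10) returns 5, since 65535*20 exceeds 0xffff<<4 but fits within 0xffff<<5.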
+ const bufSz = 65535 * 10 + const ws = uint32(5) + c.CreateConnectedWithRawOptions(789, 30000, bufSz, []byte{ header.TCPOptionWS, 3, 0, header.TCPOptionNOP, }) // Write chunks of 50000 bytes. - remain := wnd + remain := 0 sent := 0 data := make([]byte, 50000) - for remain > len(data) { + // Keep writing till the window drops below len(data). + for { c.SendPacket(data, &context.Headers{ SrcPort: context.TestPort, DstPort: c.Port, @@ -2341,21 +2437,25 @@ func TestZeroScaledWindowReceive(t *testing.T) { RcvWnd: 30000, }) sent += len(data) - remain -= len(data) - checker.IPv4(t, c.GetPacket(), + pkt := c.GetPacket() + checker.IPv4(t, pkt, checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(uint16(remain>>ws)), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), checker.TCPFlags(header.TCPFlagAck), ), ) + // Don't reduce window to zero here. + if wnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()); wnd<<ws < len(data) { + remain = wnd << ws + break + } } // Make the window non-zero, but the scaled window zero. - if remain >= 16 { + for remain >= 16 { data = data[:remain-15] c.SendPacket(data, &context.Headers{ SrcPort: context.TestPort, @@ -2366,22 +2466,35 @@ func TestZeroScaledWindowReceive(t *testing.T) { RcvWnd: 30000, }) sent += len(data) - remain -= len(data) - checker.IPv4(t, c.GetPacket(), + pkt := c.GetPacket() + checker.IPv4(t, pkt, checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(0), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), checker.TCPFlags(header.TCPFlagAck), ), ) + // Since the receive buffer is split between window advertisement and the + // application data buffer, the window does not always reflect the space + // available; the actual space available can be a bit more than what is + // advertised in the window. + wnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) + if wnd == 0 { + break + } + remain = wnd << ws } - // Read at least 1MSS of data. An ack should be sent in response to that. + // Read at least 2MSS of data. An ack should be sent in response to that. + // Since buffer space is now split in half between window and application + // data we need to read more than 1 MSS (65536) of data for a non-zero window + // update to be sent. For 1MSS worth of window to be available we need to + // read at least 128KB. Since our segments above were 50KB each it means + // we need to read at least 3 packets.
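Before the read loop below, a note on the "scaled window zero" condition driven above: it relies on integer truncation in the window field. With a window scale ws, the 16-bit field carries space >> ws, so the receiver can have room left yet still advertise zero:

// scaledWindowZero reports whether a non-empty amount of receive space still
// truncates to a zero advertised window under scale ws.
scaledWindowZero := func(space int, ws uint) bool {
	return space > 0 && space>>ws == 0
}

That is why the loop sends remain-15 bytes per iteration: it aims to leave only 15 bytes of space, which is non-zero but scales down to zero for any ws >= 4.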
sz := 0 - for sz < defaultMTU { + for sz < defaultMTU*2 { v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Read failed: %s", err) @@ -2393,9 +2506,9 @@ func TestZeroScaledWindowReceive(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(uint16(sz>>ws)), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), + checker.TCPWindowGreaterThanEq(uint16(defaultMTU>>ws)), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -2462,8 +2575,8 @@ func TestSegmentMerging(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize+1), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+uint32(i)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+uint32(i)+1), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2485,8 +2598,8 @@ func TestSegmentMerging(t *testing.T) { checker.PayloadLen(len(allData)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+11), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+11), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2533,8 +2646,8 @@ func TestDelay(t *testing.T) { checker.PayloadLen(len(want)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(seq)), - checker.AckNum(790), + checker.TCPSeqNum(uint32(seq)), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2580,8 +2693,8 @@ func TestUndelay(t *testing.T) { checker.PayloadLen(len(allData[0])+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(seq)), - checker.AckNum(790), + checker.TCPSeqNum(uint32(seq)), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2603,8 +2716,8 @@ func TestUndelay(t *testing.T) { checker.PayloadLen(len(allData[1])+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(seq)), - checker.AckNum(790), + checker.TCPSeqNum(uint32(seq)), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2665,8 +2778,8 @@ func TestMSSNotDelayed(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(seq)), - checker.AckNum(790), + checker.TCPSeqNum(uint32(seq)), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2717,8 +2830,8 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) { checker.IPv4(t, b, checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1+uint32(bytesReceived)), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1+uint32(bytesReceived)), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -2838,12 +2951,12 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. 
select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -2865,8 +2978,9 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) { // Set the SynRcvd threshold to zero to force a syn cookie based accept // to happen. - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err) + opt := tcpip.TCPSynRcvdCountThresholdOption(0) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } // Create EP and start listening. @@ -2893,12 +3007,12 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -2959,7 +3073,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { // Set the buffer size to a deterministic size so that we can check the // window scaling option. const rcvBufferSize = 0x20000 - const wndScale = 2 + const wndScale = 3 if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil { t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", rcvBufferSize, err) } @@ -2994,7 +3108,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagSyn), checker.SrcPort(tcpHdr.SourcePort()), - checker.SeqNum(tcpHdr.SequenceNumber()), + checker.TCPSeqNum(tcpHdr.SequenceNumber()), checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}), ), ) @@ -3015,16 +3129,16 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(iss)+1), ), ) // Wait for connection to be established.
select { case <-ch: - if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil { - t.Fatalf("GetSockOpt failed: %s", err) + if err := c.EP.LastError(); err != nil { + t.Fatalf("Connect failed: %s", err) } case <-time.After(1 * time.Second): t.Fatalf("Timed out waiting for connection") @@ -3144,8 +3258,9 @@ func TestMaxRetransmitsTimeout(t *testing.T) { defer c.Cleanup() const numRetries = 2 - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil { - t.Fatalf("could not set protocol option MaxRetries.\n") + opt := tcpip.TCPMaxRetriesOption(numRetries) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) @@ -3204,8 +3319,9 @@ func TestMaxRTO(t *testing.T) { defer c.Cleanup() rto := 1 * time.Second - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPMaxRTO(%d) failed: %s", rto, err) + opt := tcpip.TCPMaxRTOOption(rto) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) @@ -3307,8 +3423,8 @@ func TestFinImmediately(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3328,8 +3444,8 @@ func TestFinImmediately(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(791), + checker.TCPSeqNum(uint32(c.IRS)+2), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -3350,8 +3466,8 @@ func TestFinRetransmit(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3361,8 +3477,8 @@ func TestFinRetransmit(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3382,8 +3498,8 @@ func TestFinRetransmit(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(791), + checker.TCPSeqNum(uint32(c.IRS)+2), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -3406,8 +3522,8 @@ func TestFinWithNoPendingData(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3431,8 +3547,8 @@ func TestFinWithNoPendingData(t *testing.T) { 
checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3453,8 +3569,8 @@ func TestFinWithNoPendingData(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(791), + checker.TCPSeqNum(next), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -3481,8 +3597,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3500,8 +3616,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3520,8 +3636,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3541,8 +3657,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(791), + checker.TCPSeqNum(next), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -3565,8 +3681,8 @@ func TestFinWithPendingData(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3590,8 +3706,8 @@ func TestFinWithPendingData(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3606,8 +3722,8 @@ func TestFinWithPendingData(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3627,8 +3743,8 @@ func TestFinWithPendingData(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(791), + checker.TCPSeqNum(next), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -3652,8 +3768,8 @@ func TestFinWithPartialAck(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3673,8 
+3789,8 @@ func TestFinWithPartialAck(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(791), + checker.TCPSeqNum(next), + checker.TCPAckNum(791), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3688,8 +3804,8 @@ func TestFinWithPartialAck(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(791), + checker.TCPSeqNum(next), + checker.TCPAckNum(791), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3704,8 +3820,8 @@ func TestFinWithPartialAck(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(791), + checker.TCPSeqNum(next), + checker.TCPAckNum(791), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -3796,8 +3912,8 @@ func scaledSendWindow(t *testing.T, scale uint8) { checker.PayloadLen((1<<scale)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -3935,7 +4051,7 @@ func TestReceivedSegmentQueuing(t *testing.T) { checker.IPv4(t, b, checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), + checker.TCPSeqNum(uint32(c.IRS)+1), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -3962,8 +4078,9 @@ func TestReadAfterClosedState(t *testing.T) { // Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed // after 1 second in TIME_WAIT state. tcpTimeWaitTimeout := 1 * time.Second - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err) + opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } c.CreateConnected(789, 30000, -1 /* epRcvBuf */) @@ -3985,8 +4102,8 @@ func TestReadAfterClosedState(t *testing.T) { checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin), ), ) @@ -4010,8 +4127,8 @@ func TestReadAfterClosedState(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+2), - checker.AckNum(uint32(791+len(data))), + checker.TCPSeqNum(uint32(c.IRS)+2), + checker.TCPAckNum(uint32(791+len(data))), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -4183,8 +4300,8 @@ func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { func TestDefaultBufferSizes(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, }) // Check the default values. 
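Stack construction also changes shape in this diff: protocols are now registered as factory functions that the stack invokes itself, rather than as pre-built instances. The updated pattern, as used in the hunks below:

s := stack.New(stack.Options{
	// NewProtocol is passed as a constructor, not called at the call site.
	NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
	TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
})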
@@ -4202,11 +4319,15 @@ func TestDefaultBufferSizes(t *testing.T) { checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize) // Change the default send buffer size. - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{ - Min: 1, - Default: tcp.DefaultSendBufferSize * 2, - Max: tcp.DefaultSendBufferSize * 20}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPSendBufferSizeRangeOption{ + Min: 1, + Default: tcp.DefaultSendBufferSize * 2, + Max: tcp.DefaultSendBufferSize * 20, + } + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } } ep.Close() @@ -4219,11 +4340,15 @@ func TestDefaultBufferSizes(t *testing.T) { checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize) // Change the default receive buffer size. - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{ - Min: 1, - Default: tcp.DefaultReceiveBufferSize * 3, - Max: tcp.DefaultReceiveBufferSize * 30}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + { + opt := tcpip.TCPReceiveBufferSizeRangeOption{ + Min: 1, + Default: tcp.DefaultReceiveBufferSize * 3, + Max: tcp.DefaultReceiveBufferSize * 30, + } + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } } ep.Close() @@ -4238,8 +4363,8 @@ func TestDefaultBufferSizes(t *testing.T) { func TestMinMaxBufferSizes(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, }) // Check the default values. @@ -4250,22 +4375,28 @@ func TestMinMaxBufferSizes(t *testing.T) { defer ep.Close() // Change the min/max values for send/receive - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20} + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } } - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPSendBufferSizeRangeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30} + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } } - // Set values below the min. - if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil { + // Set values below the min/2. 
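The expectations in the next hunk (a request of 99 ends up at the configured min of 200, and requests above max land at max*2) imply the sizing rule sketched below. The helper is an illustrative reconstruction of that behavior, not gvisor API:

// effectiveBufSize mirrors what the assertions below require: cap the request
// at max, double it for segment overhead, then floor the result at min.
func effectiveBufSize(req, min, max int) int {
	if req > max {
		req = max
	}
	req *= 2 // tcp.SegOverheadFactor
	if req < min {
		req = min
	}
	return req
}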
+ if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 99); err != nil { t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 99) failed: %s", err) } checkRecvBufferSize(t, ep, 200) - if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil { + if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 149); err != nil { t.Fatalf("SetSockOptInt(SendBufferSizeOption, 149) failed: %s", err) } @@ -4276,19 +4407,21 @@ func TestMinMaxBufferSizes(t *testing.T) { t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption) failed: %s", err) } - checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20) + // Values above max are capped at max and then doubled. + checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20*2) if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil { t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err) } - checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30) + // Values above max are capped at max and then doubled. + checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30*2) } func TestBindToDeviceOption(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}}) + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}}) ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { @@ -4321,16 +4454,15 @@ func TestBindToDeviceOption(t *testing.T) { t.Run(testAction.name, func(t *testing.T) { if testAction.setBindToDevice != nil { bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice) - if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr { - t.Errorf("SetSockOpt(%#v) got %v, want %v", bindToDevice, gotErr, wantErr) + if gotErr, wantErr := ep.SetSockOpt(&bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr { + t.Errorf("got SetSockOpt(&%T(%d)) = %s, want = %s", bindToDevice, bindToDevice, gotErr, wantErr) } } bindToDevice := tcpip.BindToDeviceOption(88888) if err := ep.GetSockOpt(&bindToDevice); err != nil { - t.Errorf("GetSockOpt got %s, want %v", err, nil) - } - if got, want := bindToDevice, testAction.getBindToDevice; got != want { - t.Errorf("bindToDevice got %d, want %d", got, want) + t.Errorf("GetSockOpt(&%T): %s", bindToDevice, err) + } else if bindToDevice != testAction.getBindToDevice { + t.Errorf("got bindToDevice = %d, want %d", bindToDevice, testAction.getBindToDevice) } }) } @@ -4338,11 +4470,11 @@ func TestBindToDeviceOption(t *testing.T) { func makeStack() (*stack.Stack, *tcpip.Error) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ - ipv4.NewProtocol(), - ipv6.NewProtocol(), + NetworkProtocols: []stack.NetworkProtocolFactory{ + ipv4.NewProtocol, + ipv6.NewProtocol, }, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, }) id := loopback.New() @@ -4411,7 +4543,7 @@ func TestSelfConnect(t *testing.T) { } <-notifyCh - if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != nil { + if err := ep.LastError(); err != nil { t.Fatalf("Connect failed: %s", err) } @@ -4625,8 +4757,8 @@ func TestPathMTUDiscovery(t *testing.T) { checker.PayloadLen(size+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(seqNum), - checker.AckNum(790), +
checker.TCPSeqNum(seqNum), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -4717,8 +4849,8 @@ func TestStackSetCongestionControl(t *testing.T) { t.Fatalf("s.TransportProtocolOption(%v, %v) = %s", tcp.ProtocolNumber, &oldCC, err) } - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err { - t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.cc, err, tc.err) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &tc.cc); err != tc.err { + t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%s)) = %s, want = %s", tcp.ProtocolNumber, tc.cc, tc.cc, err, tc.err) } var cc tcpip.CongestionControlOption @@ -4750,12 +4882,12 @@ func TestStackAvailableCongestionControl(t *testing.T) { s := c.Stack() // Query permitted congestion control algorithms. - var aCC tcpip.AvailableCongestionControlOption + var aCC tcpip.TCPAvailableCongestionControlOption if err := s.TransportProtocolOption(tcp.ProtocolNumber, &aCC); err != nil { t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &aCC, err) } - if got, want := aCC, tcpip.AvailableCongestionControlOption("reno cubic"); got != want { - t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want) + if got, want := aCC, tcpip.TCPAvailableCongestionControlOption("reno cubic"); got != want { + t.Fatalf("got tcpip.TCPAvailableCongestionControlOption: %v, want: %v", got, want) } } @@ -4766,18 +4898,18 @@ func TestStackSetAvailableCongestionControl(t *testing.T) { s := c.Stack() // Setting AvailableCongestionControlOption should fail. - aCC := tcpip.AvailableCongestionControlOption("xyz") + aCC := tcpip.TCPAvailableCongestionControlOption("xyz") if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &aCC); err == nil { - t.Fatalf("s.TransportProtocolOption(%v, %v) = nil, want non-nil", tcp.ProtocolNumber, &aCC) + t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%s)) = nil, want non-nil", tcp.ProtocolNumber, aCC, aCC) } // Verify that we still get the expected list of congestion control options. 
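All option plumbing in this diff moves to pointers to typed options, in both the get and the set direction. The round trip for congestion control, following the call patterns exercised just below:

var got tcpip.CongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &got); err != nil {
	t.Fatalf("TransportProtocolOption(%d, &%T): %s", tcp.ProtocolNumber, got, err)
}
opt := tcpip.CongestionControlOption("cubic")
if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
	t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err)
}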
- var cc tcpip.AvailableCongestionControlOption + var cc tcpip.TCPAvailableCongestionControlOption if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil { - t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err) + t.Fatalf("s.TransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, cc, cc, err) } - if got, want := cc, tcpip.AvailableCongestionControlOption("reno cubic"); got != want { - t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want) + if got, want := cc, tcpip.TCPAvailableCongestionControlOption("reno cubic"); got != want { + t.Fatalf("got tcpip.TCPAvailableCongestionControlOption = %s, want = %s", got, want) } } @@ -4806,20 +4938,20 @@ func TestEndpointSetCongestionControl(t *testing.T) { var oldCC tcpip.CongestionControlOption if err := c.EP.GetSockOpt(&oldCC); err != nil { - t.Fatalf("c.EP.SockOpt(%v) = %s", &oldCC, err) + t.Fatalf("c.EP.GetSockOpt(&%T) = %s", oldCC, err) } if connected { c.Connect(789 /* iss */, 32768 /* rcvWnd */, nil) } - if err := c.EP.SetSockOpt(tc.cc); err != tc.err { - t.Fatalf("c.EP.SetSockOpt(%v) = %s, want %s", tc.cc, err, tc.err) + if err := c.EP.SetSockOpt(&tc.cc); err != tc.err { + t.Fatalf("got c.EP.SetSockOpt(&%#v) = %s, want %s", tc.cc, err, tc.err) } var cc tcpip.CongestionControlOption if err := c.EP.GetSockOpt(&cc); err != nil { - t.Fatalf("c.EP.SockOpt(%v) = %s", &cc, err) + t.Fatalf("c.EP.GetSockOpt(&%T): %s", cc, err) } got, want := cc, oldCC @@ -4831,7 +4963,7 @@ func TestEndpointSetCongestionControl(t *testing.T) { want = tc.cc } if got != want { - t.Fatalf("got congestion control: %v, want: %v", got, want) + t.Fatalf("got congestion control = %+v, want = %+v", got, want) } }) } @@ -4841,8 +4973,8 @@ func TestEndpointSetCongestionControl(t *testing.T) { func enableCUBIC(t *testing.T, c *context.Context) { t.Helper() opt := tcpip.CongestionControlOption("cubic") - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil { - t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %s = %s", opt, err) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err) } } @@ -4852,11 +4984,23 @@ func TestKeepalive(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) + const keepAliveIdle = 100 * time.Millisecond const keepAliveInterval = 3 * time.Second - c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond)) - c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval)) + keepAliveIdleOpt := tcpip.KeepaliveIdleOption(keepAliveIdle) + if err := c.EP.SetSockOpt(&keepAliveIdleOpt); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIdleOpt, keepAliveIdle, err) + } + keepAliveIntervalOpt := tcpip.KeepaliveIntervalOption(keepAliveInterval) + if err := c.EP.SetSockOpt(&keepAliveIntervalOpt); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIntervalOpt, keepAliveInterval, err) + } - c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5) - c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true) + if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil { + t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil { + t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err) + } // 5 unacked keepalives are sent.
ACK each one, and check that the // connection stays alive after 5. @@ -4865,8 +5009,8 @@ func TestKeepalive(t *testing.T) { checker.IPv4(t, b, checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)), - checker.AckNum(uint32(790)), + checker.TCPSeqNum(uint32(c.IRS)), + checker.TCPAckNum(uint32(790)), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -4899,8 +5043,8 @@ func TestKeepalive(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -4911,8 +5055,8 @@ func TestKeepalive(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh), ), ) @@ -4937,8 +5081,8 @@ func TestKeepalive(t *testing.T) { checker.IPv4(t, b, checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(next-1)), - checker.AckNum(uint32(790)), + checker.TCPSeqNum(uint32(next-1)), + checker.TCPAckNum(uint32(790)), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -4964,8 +5108,8 @@ func TestKeepalive(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(next)), - checker.AckNum(uint32(0)), + checker.TCPSeqNum(uint32(next)), + checker.TCPAckNum(uint32(0)), checker.TCPFlags(header.TCPFlagRst), ), ) @@ -5005,7 +5149,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki checker.SrcPort(context.StackPort), checker.DstPort(srcPort), checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), - checker.AckNum(uint32(irs) + 1), + checker.TCPAckNum(uint32(irs) + 1), } if synCookieInUse { @@ -5049,7 +5193,7 @@ func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCoo checker.SrcPort(context.StackPort), checker.DstPort(srcPort), checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), - checker.AckNum(uint32(irs) + 1), + checker.TCPAckNum(uint32(irs) + 1), } if synCookieInUse { @@ -5122,12 +5266,12 @@ func TestListenBacklogFull(t *testing.T) { defer c.WQ.EventUnregister(&we) for i := 0; i < listenBacklog; i++ { - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5139,7 +5283,7 @@ func TestListenBacklogFull(t *testing.T) { } // Now verify that there are no more connections that can be accepted. - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err != tcpip.ErrWouldBlock { select { case <-ch: @@ -5151,12 +5295,12 @@ func TestListenBacklogFull(t *testing.T) { // Now a new handshake must succeed. executeHandshake(t, c, context.TestPort+2, false /*synCookieInUse */) - newEP, _, err := c.EP.Accept() + newEP, _, err := c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. 
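Every Accept call in this file gains an argument, passed as nil throughout, as in the loops above. It appears to be an optional out-parameter for the peer address; a sketch of the non-nil form, assuming the parameter is a *tcpip.FullAddress filled in on success (lep here is a hypothetical listening endpoint):

var peerAddr tcpip.FullAddress
newEP, newWQ, err := lep.Accept(&peerAddr)
if err == nil {
	// peerAddr now holds the remote address of the accepted connection.
	_, _ = newEP, newWQ
}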
select { case <-ch: - newEP, _, err = c.EP.Accept() + newEP, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5181,6 +5325,8 @@ func TestListenBacklogFull(t *testing.T) { func TestListenNoAcceptNonUnicastV4(t *testing.T) { multicastAddr := tcpip.Address("\xe0\x00\x01\x02") otherMulticastAddr := tcpip.Address("\xe0\x00\x01\x03") + subnet := context.StackAddrWithPrefix.Subnet() + subnetBroadcastAddr := subnet.Broadcast() tests := []struct { name string @@ -5188,53 +5334,59 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) { dstAddr tcpip.Address }{ { - "SourceUnspecified", - header.IPv4Any, - context.StackAddr, + name: "SourceUnspecified", + srcAddr: header.IPv4Any, + dstAddr: context.StackAddr, }, { - "SourceBroadcast", - header.IPv4Broadcast, - context.StackAddr, + name: "SourceBroadcast", + srcAddr: header.IPv4Broadcast, + dstAddr: context.StackAddr, }, { - "SourceOurMulticast", - multicastAddr, - context.StackAddr, + name: "SourceOurMulticast", + srcAddr: multicastAddr, + dstAddr: context.StackAddr, }, { - "SourceOtherMulticast", - otherMulticastAddr, - context.StackAddr, + name: "SourceOtherMulticast", + srcAddr: otherMulticastAddr, + dstAddr: context.StackAddr, }, { - "DestUnspecified", - context.TestAddr, - header.IPv4Any, + name: "DestUnspecified", + srcAddr: context.TestAddr, + dstAddr: header.IPv4Any, }, { - "DestBroadcast", - context.TestAddr, - header.IPv4Broadcast, + name: "DestBroadcast", + srcAddr: context.TestAddr, + dstAddr: header.IPv4Broadcast, }, { - "DestOurMulticast", - context.TestAddr, - multicastAddr, + name: "DestOurMulticast", + srcAddr: context.TestAddr, + dstAddr: multicastAddr, }, { - "DestOtherMulticast", - context.TestAddr, - otherMulticastAddr, + name: "DestOtherMulticast", + srcAddr: context.TestAddr, + dstAddr: otherMulticastAddr, + }, + { + name: "SrcSubnetBroadcast", + srcAddr: subnetBroadcastAddr, + dstAddr: context.StackAddr, + }, + { + name: "DestSubnetBroadcast", + srcAddr: context.TestAddr, + dstAddr: subnetBroadcastAddr, }, } for _, test := range tests { - test := test // capture range variable - t.Run(test.name, func(t *testing.T) { - t.Parallel() - c := context.New(t, defaultMTU) defer c.Cleanup() @@ -5275,7 +5427,7 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn), - checker.AckNum(uint32(irs)+1))) + checker.TCPAckNum(uint32(irs)+1))) }) } } @@ -5283,8 +5435,8 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) { // TestListenNoAcceptMulticastBroadcastV6 makes sure that TCP segments with a // non unicast IPv6 address are not accepted. 
func TestListenNoAcceptNonUnicastV6(t *testing.T) { - multicastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01") - otherMulticastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02") + multicastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01") + otherMulticastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02") tests := []struct { name string @@ -5334,11 +5486,7 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) { } for _, test := range tests { - test := test // capture range variable - t.Run(test.name, func(t *testing.T) { - t.Parallel() - c := context.New(t, defaultMTU) defer c.Cleanup() @@ -5379,7 +5527,7 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn), - checker.AckNum(uint32(irs)+1))) + checker.TCPAckNum(uint32(irs)+1))) }) } } @@ -5427,7 +5575,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), - checker.AckNum(uint32(irs) + 1), + checker.TCPAckNum(uint32(irs) + 1), } checker.IPv4(t, b, checker.TCP(tcpCheckers...)) @@ -5463,12 +5611,12 @@ func TestListenSynRcvdQueueFull(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - newEP, _, err := c.EP.Accept() + newEP, _, err := c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - newEP, _, err = c.EP.Accept() + newEP, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5492,8 +5640,9 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(1)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption to 1 failed: %s", err) + opt := tcpip.TCPSynRcvdCountThresholdOption(1) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } // Create TCP endpoint. @@ -5539,12 +5688,12 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5555,7 +5704,7 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) { } // Now verify that there are no more connections that can be accepted. 
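The SrcSubnetBroadcast and DestSubnetBroadcast cases added to TestListenNoAcceptNonUnicastV4 above use Subnet.Broadcast(), the directed broadcast address of the stack's subnet. Conceptually that address is the network address with every host bit set, which can be sketched as:

// broadcast ORs an address with the inverted network mask, e.g.
// 192.168.1.0/24 -> 192.168.1.255.
func broadcast(addr, mask []byte) []byte {
	out := make([]byte, len(addr))
	for i := range addr {
		out[i] = addr[i] | ^mask[i]
	}
	return out
}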
- _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err != tcpip.ErrWouldBlock { select { case <-ch: @@ -5604,7 +5753,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), - checker.AckNum(uint32(irs) + 1), + checker.TCPAckNum(uint32(irs) + 1), } checker.IPv4(t, b, checker.TCP(tcpCheckers...)) @@ -5625,8 +5774,8 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.AckNum(uint32(irs) + 1), - checker.SeqNum(uint32(iss + 1)), + checker.TCPAckNum(uint32(irs) + 1), + checker.TCPSeqNum(uint32(iss + 1)), } checker.IPv4(t, b, checker.TCP(tcpCheckers...)) @@ -5644,7 +5793,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { RcvWnd: 30000, }) - newEP, _, err := c.EP.Accept() + newEP, _, err := c.EP.Accept(nil) if err != nil && err != tcpip.ErrWouldBlock { t.Fatalf("Accept failed: %s", err) @@ -5659,7 +5808,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { // Wait for connection to be established. select { case <-ch: - newEP, _, err = c.EP.Accept() + newEP, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5717,12 +5866,12 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) { defer c.WQ.EventUnregister(&we) // Verify that there is only one acceptable connection at this point. - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5787,12 +5936,12 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { defer c.WQ.EventUnregister(&we) // Now check that there is one acceptable connections. - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - _, _, err = c.EP.Accept() + _, _, err = c.EP.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5840,12 +5989,12 @@ func TestEndpointBindListenAcceptState(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - aep, _, err := ep.Accept() + aep, _, err := ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - aep, _, err = ep.Accept() + aep, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -5893,13 +6042,19 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // the segment queue holding unprocessed packets is limited to 500. const receiveBufferSize = 80 << 10 // 80KB. const maxReceiveBufferSize = receiveBufferSize * 10 - if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize} + if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } } // Enable auto-tuning. 
- if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Change the expected window scale to match the value needed for the // maximum buffer size defined above. @@ -5918,16 +6073,14 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { time.Sleep(latency) rawEP.SendPacketWithTS([]byte{1}, tsVal) - // Verify that the ACK has the expected window. - wantRcvWnd := receiveBufferSize - wantRcvWnd = (wantRcvWnd >> uint32(c.WindowScale)) - rawEP.VerifyACKRcvWnd(uint16(wantRcvWnd - 1)) + pkt := rawEP.VerifyAndReturnACKWithTS(tsVal) + rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() time.Sleep(25 * time.Millisecond) // Allocate a large enough payload for the test. - b := make([]byte, int(receiveBufferSize)*2) - offset := 0 - payloadSize := receiveBufferSize - 1 + payloadSize := receiveBufferSize * 2 + b := make([]byte, int(payloadSize)) + worker := (c.EP).(interface { StopWork() ResumeWork() @@ -5936,11 +6089,15 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // Stop the worker goroutine. worker.StopWork() - start := offset - end := offset + payloadSize + start := 0 + end := payloadSize / 2 packetsSent := 0 for ; start < end; start += mss { - rawEP.SendPacketWithTS(b[start:start+mss], tsVal) + packetEnd := start + mss + if start+mss > end { + packetEnd = end + } + rawEP.SendPacketWithTS(b[start:packetEnd], tsVal) packetsSent++ } @@ -5948,29 +6105,20 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // are waiting to be read. worker.ResumeWork() - // Since we read no bytes the window should goto zero till the - // application reads some of the data. - // Discard all intermediate acks except the last one. - if packetsSent > 100 { - for i := 0; i < (packetsSent / 100); i++ { - _ = c.GetPacket() - } + // Since we sent almost the full receive buffer worth of data (some may have + // been dropped due to segment overheads), we should get a zero window back. + pkt = c.GetPacket() + tcpHdr := header.TCP(header.IPv4(pkt).Payload()) + gotRcvWnd := tcpHdr.WindowSize() + wantAckNum := tcpHdr.AckNumber() + if got, want := int(gotRcvWnd), 0; got != want { + t.Fatalf("got rcvWnd: %d, want: %d", got, want) } - rawEP.VerifyACKRcvWnd(0) time.Sleep(25 * time.Millisecond) - // Verify that sending more data when window is closed is dropped and - // not acked. + // Send more data while the receive buffer is exhausted. rawEP.SendPacketWithTS(b[start:start+mss], tsVal) - // Verify that the stack sends us back an ACK with the sequence number - // of the last packet sent indicating it was dropped. - p := c.GetPacket() - checker.IPv4(t, p, checker.TCP( - checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)), - checker.Window(0), - )) - // Now read all the data from the endpoint and verify that the advertised // window increases to the full available buffer size. for { @@ -5983,23 +6131,26 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // Verify that we receive a non-zero window update ACK.
When running // under the thread sanitizer this test can end up sending more than 1 // ack, 1 for the non-zero window - p = c.GetPacket() + p := c.GetPacket() checker.IPv4(t, p, checker.TCP( - checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)), + checker.TCPAckNum(uint32(wantAckNum)), func(t *testing.T, h header.Transport) { tcp, ok := h.(header.TCP) if !ok { return } - if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) { - t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w) + // We use 10% here as the upward error margin because the initial window we + // got was after 1 segment was already in the receive buffer queue. + tolerance := 1.1 + if w := tcp.WindowSize(); w == 0 || w > uint16(float64(rcvWnd)*tolerance) { + t.Errorf("expected a non-zero window: got %d, want <= %d", w, uint16(float64(rcvWnd)*tolerance)) } }, )) } -// This test verifies that the auto tuning does not grow the receive buffer if -// the application is not reading the data actively. +// This test verifies that the advertised window is auto-tuned up as the +// application is reading the data that is being received. func TestReceiveBufferAutoTuning(t *testing.T) { const mtu = 1500 const mss = mtu - header.IPv4MinimumSize - header.TCPMinimumSize @@ -6009,26 +6160,33 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Enable Auto-tuning. stk := c.Stack() - // Set lower limits for auto-tuning tests. This is required because the - // test stops the worker which can cause packets to be dropped because - // the segment queue holding unprocessed packets is limited to 300. const receiveBufferSize = 80 << 10 // 80KB. const maxReceiveBufferSize = receiveBufferSize * 10 - if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize} + if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } } // Enable auto-tuning. - if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Change the expected window scale to match the value needed for the // maximum buffer size used by the stack.
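The braced option blocks above are instances of the calling convention this change rolls out everywhere: stack-wide settings are now distinct named types in the tcpip package and are handed to SetTransportProtocolOption by pointer, so the value must be addressable. A minimal sketch of that convention as a reusable helper (the helper itself is illustrative; the option and package names are the ones used in this diff):

// setTCPRcvBufRange applies TCP receive buffer limits using the new
// pointer-based option API; the local variable makes the option addressable.
func setTCPRcvBufRange(s *stack.Stack, min, def, max int) *tcpip.Error {
	opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: min, Default: def, Max: max}
	return s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)
}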
c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize)) rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4}) - - wantRcvWnd := receiveBufferSize + tsVal := uint32(rawEP.TSVal) + rawEP.NextSeqNum-- + rawEP.SendPacketWithTS(nil, tsVal) + rawEP.NextSeqNum++ + pkt := rawEP.VerifyAndReturnACKWithTS(tsVal) + curRcvWnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.WindowScale scaleRcvWnd := func(rcvWnd int) uint16 { return uint16(rcvWnd >> uint16(c.WindowScale)) } @@ -6045,14 +6203,8 @@ func TestReceiveBufferAutoTuning(t *testing.T) { StopWork() ResumeWork() }) - tsVal := rawEP.TSVal - // We are going to do our own computation of what the moderated receive - // buffer should be based on sent/copied data per RTT and verify that - // the advertised window by the stack matches our calculations. - prevCopied := 0 - done := false latency := 1 * time.Millisecond - for i := 0; !done; i++ { + for i := 0; i < 5; i++ { tsVal++ // Stop the worker goroutine. @@ -6074,15 +6226,20 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Give 1ms for the worker to process the packets. time.Sleep(1 * time.Millisecond) - // Verify that the advertised window on the ACK is reduced by - // the total bytes sent. - expectedWnd := wantRcvWnd - totalSent - if packetsSent > 100 { - for i := 0; i < (packetsSent / 100); i++ { - _ = c.GetPacket() + lastACK := c.GetPacket() + // Discard any intermediate ACKs and only check the last ACK we get in a + // short time period of a few ms. + for { + time.Sleep(1 * time.Millisecond) + pkt := c.GetPacketNonBlocking() + if pkt == nil { + break } + lastACK = pkt + } + if got, want := int(header.TCP(header.IPv4(lastACK).Payload()).WindowSize()), int(scaleRcvWnd(curRcvWnd)); got > want { + t.Fatalf("advertised window got: %d, want <= %d", got, want) } - rawEP.VerifyACKRcvWnd(scaleRcvWnd(expectedWnd)) // Now read all the data from the endpoint and invoke the // moderation API to allow for receive buffer auto-tuning @@ -6112,30 +6269,28 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // In the first iteration the receiver-based RTT is not // yet known; as a result the moderation code should not // increase the advertised window. - rawEP.VerifyACKRcvWnd(scaleRcvWnd(wantRcvWnd)) - prevCopied = totalCopied + rawEP.VerifyACKRcvWnd(scaleRcvWnd(curRcvWnd)) } else { - rttCopied := totalCopied - if i == 1 { - // The moderation code accumulates copied bytes till - // RTT is established. So add in the bytes sent in - // the first iteration to the total bytes for this - // RTT. - rttCopied += prevCopied - // Now reset it to the initial value used by the - // auto tuning logic. - prevCopied = tcp.InitialCwnd * mss * 2 + // Read loop above could generate an ACK if the window had dropped to + // zero and then read had opened it up. + lastACK := c.GetPacket() + // Discard any intermediate ACKs and only check the last ACK we get in a + // short time period of a few ms. + for { + time.Sleep(1 * time.Millisecond) + pkt := c.GetPacketNonBlocking() + if pkt == nil { + break + } + lastACK = pkt } - newWnd := rttCopied<<1 + 16*mss - grow := (newWnd * (rttCopied - prevCopied)) / prevCopied - newWnd += (grow << 1) - if newWnd > maxReceiveBufferSize { - newWnd = maxReceiveBufferSize - done = true + curRcvWnd = int(header.TCP(header.IPv4(lastACK).Payload()).WindowSize()) << c.WindowScale + // If the new current window is close to maxReceiveBufferSize then terminate + // the loop.
This can happen before all iterations are done due to timing + // differences when running the test. + if int(float64(curRcvWnd)*1.1) > maxReceiveBufferSize/2 { + break } - rawEP.VerifyACKRcvWnd(scaleRcvWnd(newWnd)) - wantRcvWnd = newWnd - prevCopied = rttCopied // Increase the latency after the first two iterations to // establish a low RTT value in the receiver since it // only tracks the lowest value. This ensures that when @@ -6148,6 +6303,12 @@ func TestReceiveBufferAutoTuning(t *testing.T) { offset += payloadSize payloadSize *= 2 } + // Check that at the end of our iterations the receive window grew close to the maximum + // permissible size of maxReceiveBufferSize/2. + if got, want := int(float64(curRcvWnd)*1.1), maxReceiveBufferSize/2; got < want { + t.Fatalf("unexpected rcvWnd got: %d, want > %d", got, want) + } + } func TestDelayEnabled(t *testing.T) { @@ -6156,7 +6317,7 @@ func TestDelayEnabled(t *testing.T) { checkDelayOption(t, c, false, false) // Delay is disabled by default. for _, v := range []struct { - delayEnabled tcp.DelayEnabled + delayEnabled tcpip.TCPDelayEnabled wantDelayOption bool }{ {delayEnabled: false, wantDelayOption: false}, @@ -6164,17 +6325,17 @@ } { c := context.New(t, defaultMTU) defer c.Cleanup() - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, v.delayEnabled); err != nil { - t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %s", v.delayEnabled, err) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &v.delayEnabled); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, v.delayEnabled, v.delayEnabled, err) } checkDelayOption(t, c, v.delayEnabled, v.wantDelayOption) } } -func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption bool) { +func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.TCPDelayEnabled, wantDelayOption bool) { t.Helper() - var gotDelayEnabled tcp.DelayEnabled + var gotDelayEnabled tcpip.TCPDelayEnabled if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &gotDelayEnabled); err != nil { t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %s", err) } @@ -6206,24 +6367,27 @@ func TestTCPLingerTimeout(t *testing.T) { tcpLingerTimeout time.Duration want time.Duration }{ - {"NegativeLingerTimeout", -123123, 0}, - {"ZeroLingerTimeout", 0, 0}, + {"NegativeLingerTimeout", -123123, -1}, + // Zero is treated the same as the stack's default TCP_LINGER2 timeout. + {"ZeroLingerTimeout", 0, tcp.DefaultTCPLingerTimeout}, {"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second}, // Values > stack's TCPLingerTimeout are capped to the stack's // value.
Defaults to tcp.DefaultTCPLingerTimeout(60 seconds) - {"AboveMaxLingerTimeout", 125 * time.Second, 120 * time.Second}, + {"AboveMaxLingerTimeout", tcp.MaxTCPLingerTimeout + 5*time.Second, tcp.MaxTCPLingerTimeout}, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - if err := c.EP.SetSockOpt(tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout)); err != nil { - t.Fatalf("SetSockOpt(%s) = %s", tc.tcpLingerTimeout, err) + v := tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout) + if err := c.EP.SetSockOpt(&v); err != nil { + t.Fatalf("SetSockOpt(&%T(%s)) = %s", v, tc.tcpLingerTimeout, err) } - var v tcpip.TCPLingerTimeoutOption + + v = 0 if err := c.EP.GetSockOpt(&v); err != nil { - t.Fatalf("GetSockOpt(tcpip.TCPLingerTimeoutOption) = %s", err) + t.Fatalf("GetSockOpt(&%T) = %s", v, err) } if got, want := time.Duration(v), tc.want; got != want { - t.Fatalf("unexpected linger timeout got: %s, want: %s", got, want) + t.Fatalf("got linger timeout = %s, want = %s", got, want) } }) } @@ -6277,12 +6441,12 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -6296,8 +6460,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(iss)+1), checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) finHeaders := &context.Headers{ @@ -6314,8 +6478,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) // Now send a RST and this should be ignored and not @@ -6343,8 +6507,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) } @@ -6396,12 +6560,12 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. 
select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -6415,8 +6579,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(iss)+1), checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) finHeaders := &context.Headers{ @@ -6433,8 +6597,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) // Out of order ACK should generate an immediate ACK in @@ -6450,8 +6614,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) } @@ -6503,12 +6667,12 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -6522,8 +6686,8 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(iss)+1), checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) finHeaders := &context.Headers{ @@ -6540,8 +6704,8 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) // Send a SYN request w/ sequence number lower than @@ -6586,12 +6750,12 @@ func TestTCPTimeWaitNewSyn(t *testing.T) { c.SendPacket(nil, ackHeaders) // Try to accept the connection. - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -6609,8 +6773,9 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { // Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed // after 5 seconds in TIME_WAIT state. 
tcpTimeWaitTimeout := 5 * time.Second - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err) + opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, tcpTimeWaitTimeout, err) } want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1 @@ -6659,12 +6824,12 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -6678,8 +6843,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(iss)+1), checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) finHeaders := &context.Headers{ @@ -6696,8 +6861,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) time.Sleep(2 * time.Second) @@ -6711,8 +6876,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+2)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+2)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) // Sleep for 4 seconds so at this point we are 1 second past the @@ -6740,8 +6905,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(ackHeaders.AckNum)), - checker.AckNum(0), + checker.TCPSeqNum(uint32(ackHeaders.AckNum)), + checker.TCPAckNum(0), checker.TCPFlags(header.TCPFlagRst))) if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want { @@ -6759,8 +6924,9 @@ func TestTCPCloseWithData(t *testing.T) { // Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed // after 5 seconds in TIME_WAIT state. 
tcpTimeWaitTimeout := 5 * time.Second - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err) + opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, tcpTimeWaitTimeout, err) } wq := &waiter.Queue{} @@ -6808,12 +6974,12 @@ func TestTCPCloseWithData(t *testing.T) { wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { t.Fatalf("Accept failed: %s", err) } @@ -6839,8 +7005,8 @@ func TestTCPCloseWithData(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(iss)+2), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(iss)+2), checker.TCPFlags(header.TCPFlagAck))) // Now write a few bytes and then close the endpoint. @@ -6858,8 +7024,8 @@ func TestTCPCloseWithData(t *testing.T) { checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 1 + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(iss)+2), // Acknum is initial sequence number + 1 checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -6873,8 +7039,8 @@ func TestTCPCloseWithData(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))), - checker.AckNum(uint32(iss+2)), + checker.TCPSeqNum(uint32(c.IRS+1)+uint32(len(data))), + checker.TCPAckNum(uint32(iss+2)), checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck))) // First send a partial ACK. @@ -6919,8 +7085,8 @@ func TestTCPCloseWithData(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), - checker.SeqNum(uint32(ackHeaders.AckNum)), - checker.AckNum(0), + checker.TCPSeqNum(uint32(ackHeaders.AckNum)), + checker.TCPAckNum(0), checker.TCPFlags(header.TCPFlagRst))) } @@ -6940,7 +7106,10 @@ func TestTCPUserTimeout(t *testing.T) { // expired. initRTO := 1 * time.Second userTimeout := initRTO / 2 - c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)) + v := tcpip.TCPUserTimeoutOption(userTimeout) + if err := c.EP.SetSockOpt(&v); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", v, userTimeout, err) + } // Send some data and wait before ACKing it.
view := buffer.NewView(3) @@ -6953,8 +7122,8 @@ func TestTCPUserTimeout(t *testing.T) { checker.PayloadLen(len(view)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(next), - checker.AckNum(790), + checker.TCPSeqNum(next), + checker.TCPAckNum(790), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -6988,8 +7157,8 @@ func TestTCPUserTimeout(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(next)), - checker.AckNum(uint32(0)), + checker.TCPSeqNum(uint32(next)), + checker.TCPAckNum(uint32(0)), checker.TCPFlags(header.TCPFlagRst), ), ) @@ -7014,18 +7183,31 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value() + const keepAliveIdle = 100 * time.Millisecond const keepAliveInterval = 3 * time.Second - c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond)) - c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval)) - c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10) - c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true) + keepAliveIdleOption := tcpip.KeepaliveIdleOption(keepAliveIdle) + if err := c.EP.SetSockOpt(&keepAliveIdleOption); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIdleOption, keepAliveIdle, err) + } + keepAliveIntervalOption := tcpip.KeepaliveIntervalOption(keepAliveInterval) + if err := c.EP.SetSockOpt(&keepAliveIntervalOption); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIntervalOption, keepAliveInterval, err) + } + if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil { + t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil { + t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err) + } // Set userTimeout to the duration of 1 keepalive // probe, which means that after the first probe is sent // the second one should cause the connection to be // closed due to userTimeout being hit. - userTimeout := 1 * keepAliveInterval - c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)) + userTimeout := tcpip.TCPUserTimeoutOption(keepAliveInterval) + if err := c.EP.SetSockOpt(&userTimeout); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", userTimeout, keepAliveInterval, err) + } // Check that the connection is still alive. if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { @@ -7037,8 +7219,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { checker.IPv4(t, b, checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)), - checker.AckNum(uint32(790)), + checker.TCPSeqNum(uint32(c.IRS)), + checker.TCPAckNum(uint32(790)), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -7063,8 +7245,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS+1)), - checker.AckNum(uint32(0)), + checker.TCPSeqNum(uint32(c.IRS+1)), + checker.TCPAckNum(uint32(0)), checker.TCPFlags(header.TCPFlagRst), ), ) @@ -7080,9 +7262,9 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { } } -func TestIncreaseWindowOnReceive(t *testing.T) { +func TestIncreaseWindowOnRead(t *testing.T) { // This test ensures that the endpoint sends an ack, - // after recv() when the window grows to more than 1 MSS. + // after read() when the window grows by more than 1 MSS.
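The rewritten test below no longer tracks a lastWnd variable; it reads fields straight out of the raw ACKs it receives. The extraction pattern it leans on, as a standalone sketch (assuming pkt holds a complete IPv4 packet and the header package from pkg/tcpip/header):

// tcpWindowAndAck pulls the advertised window and ACK number out of a raw
// IPv4 packet, the same chain of conversions the updated tests use inline.
func tcpWindowAndAck(pkt []byte) (uint16, uint32) {
	tcpHdr := header.TCP(header.IPv4(pkt).Payload())
	return tcpHdr.WindowSize(), tcpHdr.AckNumber()
}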
c := context.New(t, defaultMTU) defer c.Cleanup() @@ -7091,10 +7273,9 @@ func TestIncreaseWindowOnRead(t *testing.T) { // Write chunks of ~30000 bytes. It's important that two // payloads together are equal to or longer than the MSS. - remain := rcvBuf + remain := rcvBuf * 2 sent := 0 data := make([]byte, defaultMTU/2) - lastWnd := uint16(0) for remain > len(data) { c.SendPacket(data, &context.Headers{ @@ -7107,46 +7288,43 @@ func TestIncreaseWindowOnRead(t *testing.T) { }) sent += len(data) remain -= len(data) - - lastWnd = uint16(remain) - if remain > 0xffff { - lastWnd = 0xffff - } - checker.IPv4(t, c.GetPacket(), + pkt := c.GetPacket() + checker.IPv4(t, pkt, checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(lastWnd), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), checker.TCPFlags(header.TCPFlagAck), ), ) + // Break once the window drops below defaultMTU/2. + if wnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize(); wnd < defaultMTU/2 { + break + } } - if lastWnd == 0xffff || lastWnd == 0 { - t.Fatalf("expected small, non-zero window: %d", lastWnd) - } - - // We now have < 1 MSS in the buffer space. Read the data! An - // ack should be sent in response to that. The window was not - // zero, but it grew to larger than MSS. - if _, _, err := c.EP.Read(nil); err != nil { - t.Fatalf("Read failed: %s", err) - } - - if _, _, err := c.EP.Read(nil); err != nil { - t.Fatalf("Read failed: %s", err) + // We now have < 1 MSS in the buffer space. Read at least 2 MSS + // worth of data to open up receive buffer space. + read := 0 + // defaultMTU is a good enough estimate for the MSS used for this + // connection. + for read < defaultMTU*2 { + v, _, err := c.EP.Read(nil) + if err != nil { + t.Fatalf("Read failed: %s", err) + } + read += len(v) } - // After reading two packets, we surely crossed MSS. See the ack: + // After reading > MSS worth of data, we surely crossed MSS.
See the ack: checker.IPv4(t, c.GetPacket(), checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(uint16(0xffff)), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), + checker.TCPWindow(uint16(0xffff)), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -7166,7 +7344,6 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) { remain := rcvBuf sent := 0 data := make([]byte, defaultMTU/2) - lastWnd := uint16(0) for remain > len(data) { c.SendPacket(data, &context.Headers{ @@ -7179,39 +7356,29 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) { }) sent += len(data) remain -= len(data) - - lastWnd = uint16(remain) - if remain > 0xffff { - lastWnd = 0xffff - } checker.IPv4(t, c.GetPacket(), checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(lastWnd), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), + checker.TCPWindowLessThanEq(0xffff), checker.TCPFlags(header.TCPFlagAck), ), ) } - if lastWnd == 0xffff || lastWnd == 0 { - t.Fatalf("expected small, non-zero window: %d", lastWnd) - } - // Increasing the buffer size should generate an ACK, // since the window grew from a small value to one equal to or larger than the MSS c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2) - // After reading two packets, we surely crossed MSS. See the ack: checker.IPv4(t, c.GetPacket(), checker.PayloadLen(header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(790+sent)), - checker.Window(uint16(0xffff)), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(790+sent)), + checker.TCPWindow(uint16(0xffff)), checker.TCPFlags(header.TCPFlagAck), ), ) @@ -7232,14 +7399,15 @@ func TestTCPDeferAccept(t *testing.T) { } const tcpDeferAccept = 1 * time.Second - if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil { - t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err) + tcpDeferAcceptOption := tcpip.TCPDeferAcceptOption(tcpDeferAccept) + if err := c.EP.SetSockOpt(&tcpDeferAcceptOption); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", tcpDeferAcceptOption, tcpDeferAccept, err) } irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) - if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock { - t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock) + if _, _, err := c.EP.Accept(nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got c.EP.Accept(nil) = %s, want: %s", err, tcpip.ErrWouldBlock) } // Send data. This should result in an acceptable endpoint. c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{ @@ -7255,14 +7423,14 @@ func TestTCPDeferAccept(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(iss+1)), - checker.AckNum(uint32(irs+5)))) + checker.TCPSeqNum(uint32(iss+1)), + checker.TCPAckNum(uint32(irs+5)))) // Give a bit of time for the socket to be delivered to the accept queue.
time.Sleep(50 * time.Millisecond) - aep, _, err := c.EP.Accept() + aep, _, err := c.EP.Accept(nil) if err != nil { - t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err) + t.Fatalf("got c.EP.Accept(nil) = %s, want: nil", err) } aep.Close() @@ -7270,8 +7438,8 @@ func TestTCPDeferAccept(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck), - checker.SeqNum(uint32(iss+1)), - checker.AckNum(uint32(irs+5)))) + checker.TCPSeqNum(uint32(iss+1)), + checker.TCPAckNum(uint32(irs+5)))) } func TestTCPDeferAcceptTimeout(t *testing.T) { @@ -7289,14 +7457,15 @@ func TestTCPDeferAcceptTimeout(t *testing.T) { } const tcpDeferAccept = 1 * time.Second - if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil { - t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err) + tcpDeferAcceptOpt := tcpip.TCPDeferAcceptOption(tcpDeferAccept) + if err := c.EP.SetSockOpt(&tcpDeferAcceptOpt); err != nil { + t.Fatalf("c.EP.SetSockOpt(&%T(%s)) failed: %s", tcpDeferAcceptOpt, tcpDeferAccept, err) } irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) - if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock { - t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock) + if _, _, err := c.EP.Accept(nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got c.EP.Accept(nil) = %s, want: %s", err, tcpip.ErrWouldBlock) } // Sleep for a little of the tcpDeferAccept timeout. @@ -7307,7 +7476,7 @@ checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn), - checker.AckNum(uint32(irs)+1))) + checker.TCPAckNum(uint32(irs)+1))) // Send data. This should result in an acceptable endpoint. c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{ @@ -7323,14 +7492,14 @@ checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(iss+1)), - checker.AckNum(uint32(irs+5)))) + checker.TCPSeqNum(uint32(iss+1)), + checker.TCPAckNum(uint32(irs+5)))) // Give some time for the endpoint to be delivered to the accept queue. time.Sleep(50 * time.Millisecond) - aep, _, err := c.EP.Accept() + aep, _, err := c.EP.Accept(nil) if err != nil { - t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err) + t.Fatalf("got c.EP.Accept(nil) = %s, want: nil", err) } aep.Close() @@ -7339,8 +7508,8 @@ checker.SrcPort(context.StackPort), checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck), - checker.SeqNum(uint32(iss+1)), - checker.AckNum(uint32(irs+5)))) + checker.TCPSeqNum(uint32(iss+1)), + checker.TCPAckNum(uint32(irs+5)))) } func TestResetDuringClose(t *testing.T) { @@ -7365,8 +7534,8 @@ func TestResetDuringClose(t *testing.T) { checker.IPv4(t, c.GetPacket(), checker.TCP( checker.DstPort(context.TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(irs.Add(1))), - checker.AckNum(uint32(iss.Add(5))))) + checker.TCPSeqNum(uint32(irs.Add(1))), + checker.TCPAckNum(uint32(iss.Add(5))))) // Close in a separate goroutine so that we can trigger // a race with the RST we send below.
This should not @@ -7428,9 +7597,10 @@ func TestSetStackTimeWaitReuse(t *testing.T) { } for _, tc := range testCases { - err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitReuseOption(tc.v)) + opt := tcpip.TCPTimeWaitReuseOption(tc.v) + err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt) if got, want := err, tc.err; got != want { - t.Fatalf("s.TransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.v, err, tc.err) + t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%d)) = %s, want = %s", tcp.ProtocolNumber, tc.v, tc.v, err, tc.err) } if tc.err != nil { continue @@ -7446,3 +7616,14 @@ func TestSetStackTimeWaitReuse(t *testing.T) { } } } + +// generateRandomPayload generates a random byte slice of the specified length +// causing a fatal test failure if it is unable to do so. +func generateRandomPayload(t *testing.T, n int) []byte { + t.Helper() + buf := make([]byte, n) + if _, err := rand.Read(buf); err != nil { + t.Fatalf("rand.Read(buf) failed: %s", err) + } + return buf +} diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index 8edbff964..0f9ed06cd 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -131,8 +131,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS defer c.Cleanup() if cookieEnabled { - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err) + var opt tcpip.TCPSynRcvdCountThresholdOption + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } } @@ -158,9 +159,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS checker.PayloadLen(len(data)+header.TCPMinimumSize+12), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.Window(wndSize), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), + checker.TCPWindow(wndSize), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), checker.TCPTimestampChecker(true, 0, tsVal+1), ), @@ -180,7 +181,8 @@ func TestTimeStampEnabledAccept(t *testing.T) { wndSize uint16 }{ {true, -1, 0xffff}, // When cookie is used window scaling is disabled. - {false, 5, 0x8000}, // DefaultReceiveBufferSize is 1MB >> 5. + // DefaultReceiveBufferSize is 1MB >> 5. Advertised window will be 1/2 of that. 
+ {false, 5, 0x4000}, } for _, tc := range testCases { timeStampEnabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize) @@ -192,8 +194,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd defer c.Cleanup() if cookieEnabled { - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err) + var opt tcpip.TCPSynRcvdCountThresholdOption + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } } @@ -217,9 +220,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd checker.PayloadLen(len(data)+header.TCPMinimumSize), checker.TCP( checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.Window(wndSize), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(790), + checker.TCPWindow(wndSize), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), checker.TCPTimestampChecker(false, 0, 0), ), @@ -235,7 +238,9 @@ func TestTimeStampDisabledAccept(t *testing.T) { wndSize uint16 }{ {true, -1, 0xffff}, // When cookie is used window scaling is disabled. - {false, 5, 0x8000}, // DefaultReceiveBufferSize is 1MB >> 5. + // DefaultReceiveBufferSize is 1MB >> 5. Advertised window will be half of + // that. + {false, 5, 0x4000}, } for _, tc := range testCases { timeStampDisabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize) diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index b6031354e..4d7847142 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -53,11 +53,11 @@ const ( TestPort = 4096 // StackV6Addr is the IPv6 address assigned to the stack. - StackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + StackV6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" // TestV6Addr is the source address for packets sent to the stack via // the link layer endpoint. - TestV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + TestV6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" // StackV4MappedAddr is StackAddr as a mapped v6 address. StackV4MappedAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + StackAddr @@ -68,11 +68,23 @@ const ( // V4MappedWildcardAddr is the mapped v6 representation of 0.0.0.0. V4MappedWildcardAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" - // testInitialSequenceNumber is the initial sequence number sent in packets that + // TestInitialSequenceNumber is the initial sequence number sent in packets that // are sent in response to a SYN or in the initial SYN sent to the stack. - testInitialSequenceNumber = 789 + TestInitialSequenceNumber = 789 ) +// StackAddrWithPrefix is StackAddr with its associated prefix length. +var StackAddrWithPrefix = tcpip.AddressWithPrefix{ + Address: StackAddr, + PrefixLen: 24, +} + +// StackV6AddrWithPrefix is StackV6Addr with its associated prefix length. +var StackV6AddrWithPrefix = tcpip.AddressWithPrefix{ + Address: StackV6Addr, + PrefixLen: header.IIDOffsetInIPv6Address * 8, +} + // Headers is used to represent the TCP header fields when building a // new packet. 
type Headers struct { @@ -133,32 +145,39 @@ type Context struct { // WindowScale is the expected window scale in SYN packets sent by // the stack. WindowScale uint8 + + // RcvdWindowScale is the actual window scale sent by the stack in + // SYN/SYN-ACK. + RcvdWindowScale uint8 } // New allocates and initializes a test context containing a new // stack and a link-layer endpoint. func New(t *testing.T, mtu uint32) *Context { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, }) const sendBufferSize = 1 << 20 // 1 MiB const recvBufferSize = 1 << 20 // 1 MiB // Allow minimum send/receive buffer sizes to be 1 during tests. - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + sendBufOpt := tcpip.TCPSendBufferSizeRangeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize} + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &sendBufOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v) failed: %s", tcp.ProtocolNumber, sendBufOpt, err) } - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %s", err) + rcvBufOpt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize} + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &rcvBufOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v) failed: %s", tcp.ProtocolNumber, rcvBufOpt, err) } // Increase minimum RTO in tests to avoid test flakes due to early // retransmit in case the test executors are overloaded and cause timers // to fire earlier than expected. 
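The next hunks apply that minimum-RTO bump through the pointer-based option call and, further down, migrate NIC address setup from AddAddress to AddProtocolAddress, which carries the prefix length alongside the address. A minimal sketch of the new address call, mirroring the construction used in the diff (the helper name is illustrative):

// addV4Address assigns an IPv4 address with an explicit prefix length,
// replacing the old prefixless AddAddress call.
func addV4Address(s *stack.Stack, nic tcpip.NICID, addr tcpip.Address, prefixLen int) *tcpip.Error {
	protoAddr := tcpip.ProtocolAddress{
		Protocol: ipv4.ProtocolNumber,
		AddressWithPrefix: tcpip.AddressWithPrefix{
			Address:   addr,
			PrefixLen: prefixLen,
		},
	}
	return s.AddProtocolAddress(nic, protoAddr)
}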
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMinRTOOption(3*time.Second)); err != nil { - t.Fatalf("failed to set stack-wide minRTO: %s", err) + minRTOOpt := tcpip.TCPMinRTOOption(3 * time.Second) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil { + t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err) } // Some of the congestion control tests send up to 640 packets, we so @@ -181,12 +200,20 @@ func New(t *testing.T, mtu uint32) *Context { t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts2, err) } - if err := s.AddAddress(1, ipv4.ProtocolNumber, StackAddr); err != nil { - t.Fatalf("AddAddress failed: %v", err) + v4ProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: StackAddrWithPrefix, + } + if err := s.AddProtocolAddress(1, v4ProtocolAddr); err != nil { + t.Fatalf("AddProtocolAddress(1, %#v): %s", v4ProtocolAddr, err) } - if err := s.AddAddress(1, ipv6.ProtocolNumber, StackV6Addr); err != nil { - t.Fatalf("AddAddress failed: %v", err) + v6ProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: StackV6AddrWithPrefix, + } + if err := s.AddProtocolAddress(1, v6ProtocolAddr); err != nil { + t.Fatalf("AddProtocolAddress(1, %#v): %s", v6ProtocolAddr, err) } s.SetRouteTable([]tcpip.Route{ @@ -238,18 +265,17 @@ func (c *Context) CheckNoPacket(errMsg string) { c.CheckNoPacketTimeout(errMsg, 1*time.Second) } -// GetPacket reads a packet from the link layer endpoint and verifies +// GetPacketWithTimeout reads a packet from the link layer endpoint and verifies // that it is an IPv4 packet with the expected source and destination -// addresses. It will fail with an error if no packet is received for -// 2 seconds. -func (c *Context) GetPacket() []byte { +// addresses. If no packet is received in the specified timeout it will return +// nil. +func (c *Context) GetPacketWithTimeout(timeout time.Duration) []byte { c.t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() p, ok := c.linkEP.ReadContext(ctx) if !ok { - c.t.Fatalf("Packet wasn't written out") return nil } @@ -257,6 +283,14 @@ func (c *Context) GetPacket() []byte { c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber) } + // Just check that the stack set the transport protocol number for outbound + // TCP messages. + // TODO(gvisor.dev/issues/3810): Remove when protocol numbers are part + // of the headerinfo. + if p.Pkt.TransportProtocolNumber != tcp.ProtocolNumber { + c.t.Fatalf("got p.Pkt.TransportProtocolNumber = %d, want = %d", p.Pkt.TransportProtocolNumber, tcp.ProtocolNumber) + } + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) b := vv.ToView() @@ -268,6 +302,21 @@ func (c *Context) GetPacket() []byte { return b } +// GetPacket reads a packet from the link layer endpoint and verifies +// that it is an IPv4 packet with the expected source and destination +// addresses. +func (c *Context) GetPacket() []byte { + c.t.Helper() + + p := c.GetPacketWithTimeout(5 * time.Second) + if p == nil { + c.t.Fatalf("Packet wasn't written out") + return nil + } + + return p +} + // GetPacketNonBlocking reads a packet from the link layer endpoint // and verifies that it is an IPv4 packet with the expected source // and destination address. 
If no packet is available it will return @@ -284,6 +333,14 @@ func (c *Context) GetPacketNonBlocking() []byte { c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber) } + // Just check that the stack set the transport protocol number for outbound + // TCP messages. + // TODO(gvisor.dev/issues/3810): Remove when protocol numbers are part + // of the headerinfo. + if p.Pkt.TransportProtocolNumber != tcp.ProtocolNumber { + c.t.Fatalf("got p.Pkt.TransportProtocolNumber = %d, want = %d", p.Pkt.TransportProtocolNumber, tcp.ProtocolNumber) + } + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) b := vv.ToView() @@ -447,8 +504,8 @@ func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, op checker.PayloadLen(size+header.TCPMinimumSize+optlen), checker.TCP( checker.DstPort(TestPort), - checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))), - checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))), + checker.TCPSeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))), + checker.TCPAckNum(uint32(seqnum.Value(TestInitialSequenceNumber).Add(1))), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -474,8 +531,8 @@ func (c *Context) ReceiveNonBlockingAndCheckPacket(data []byte, offset, size int checker.PayloadLen(size+header.TCPMinimumSize), checker.TCP( checker.DstPort(TestPort), - checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))), - checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))), + checker.TCPSeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))), + checker.TCPAckNum(uint32(seqnum.Value(TestInitialSequenceNumber).Add(1))), checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), ), ) @@ -613,6 +670,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) } tcpHdr := header.TCP(header.IPv4(b).Payload()) + synOpts := header.ParseSynOptions(tcpHdr.Options(), false /* isAck */) c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) c.SendPacket(nil, &Headers{ @@ -630,15 +688,15 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) checker.TCP( checker.DstPort(TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(uint32(iss)+1), + checker.TCPSeqNum(uint32(c.IRS)+1), + checker.TCPAckNum(uint32(iss)+1), ), ) // Wait for connection to be established. select { case <-notifyCh: - if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil { + if err := c.EP.LastError(); err != nil { c.t.Fatalf("Unexpected error when connecting: %v", err) } case <-time.After(1 * time.Second): @@ -648,6 +706,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got) } + c.RcvdWindowScale = uint8(synOpts.WS) c.Port = tcpHdr.SourcePort() } @@ -719,17 +778,18 @@ func (r *RawEndpoint) SendPacket(payload []byte, opts []byte) { r.NextSeqNum = r.NextSeqNum.Add(seqnum.Size(len(payload))) } -// VerifyACKWithTS verifies that the tsEcr field in the ack matches the provided -// tsVal. -func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) { +// VerifyAndReturnACKWithTS verifies that the tsEcr field in the ACK matches +// the provided tsVal and returns the original packet.
+func (r *RawEndpoint) VerifyAndReturnACKWithTS(tsVal uint32) []byte { + r.C.t.Helper() // Read ACK and verify that tsEcr of ACK packet is [1,2,3,4] ackPacket := r.C.GetPacket() checker.IPv4(r.C.t, ackPacket, checker.TCP( checker.DstPort(r.SrcPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(r.AckNum)), - checker.AckNum(uint32(r.NextSeqNum)), + checker.TCPSeqNum(uint32(r.AckNum)), + checker.TCPAckNum(uint32(r.NextSeqNum)), checker.TCPTimestampChecker(true, 0, tsVal), ), ) @@ -737,19 +797,28 @@ func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) { tcpSeg := header.TCP(header.IPv4(ackPacket).Payload()) opts := tcpSeg.ParsedOptions() r.RecentTS = opts.TSVal + return ackPacket +} + +// VerifyACKWithTS verifies that the tsEcr field in the ack matches the provided +// tsVal. +func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) { + r.C.t.Helper() + _ = r.VerifyAndReturnACKWithTS(tsVal) } // VerifyACKRcvWnd verifies that the window advertised by the incoming ACK // matches the provided rcvWnd. func (r *RawEndpoint) VerifyACKRcvWnd(rcvWnd uint16) { + r.C.t.Helper() ackPacket := r.C.GetPacket() checker.IPv4(r.C.t, ackPacket, checker.TCP( checker.DstPort(r.SrcPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(r.AckNum)), - checker.AckNum(uint32(r.NextSeqNum)), - checker.Window(rcvWnd), + checker.TCPSeqNum(uint32(r.AckNum)), + checker.TCPAckNum(uint32(r.NextSeqNum)), + checker.TCPWindow(rcvWnd), ), ) } @@ -768,8 +837,8 @@ func (r *RawEndpoint) VerifyACKHasSACK(sackBlocks []header.SACKBlock) { checker.TCP( checker.DstPort(r.SrcPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(r.AckNum)), - checker.AckNum(uint32(r.NextSeqNum)), + checker.TCPSeqNum(uint32(r.AckNum)), + checker.TCPAckNum(uint32(r.NextSeqNum)), checker.TCPSACKBlockChecker(sackBlocks), ), ) @@ -843,7 +912,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * // Build SYN-ACK. c.IRS = seqnum.Value(tcpSeg.SequenceNumber()) - iss := seqnum.Value(testInitialSequenceNumber) + iss := seqnum.Value(TestInitialSequenceNumber) c.SendPacket(nil, &Headers{ SrcPort: tcpSeg.DestinationPort(), DstPort: tcpSeg.SourcePort(), @@ -861,8 +930,8 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * tcpCheckers := []checker.TransportChecker{ checker.DstPort(TestPort), checker.TCPFlags(header.TCPFlagAck), - checker.SeqNum(uint32(c.IRS) + 1), - checker.AckNum(uint32(iss) + 1), + checker.TCPSeqNum(uint32(c.IRS) + 1), + checker.TCPAckNum(uint32(iss) + 1), } // Verify that tsEcr of ACK packet is wantOptions.TSVal if the @@ -882,8 +951,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * // Wait for connection to be established. select { case <-notifyCh: - err = c.EP.GetSockOpt(tcpip.ErrorOption{}) - if err != nil { + if err := c.EP.LastError(); err != nil { c.t.Fatalf("Unexpected error when connecting: %v", err) } case <-time.After(1 * time.Second): @@ -898,7 +966,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * // Mark in context that timestamp option is enabled for this endpoint. 
c.TimeStampEnabled = true - + c.RcvdWindowScale = uint8(synOptions.WS) return &RawEndpoint{ C: c, SrcPort: tcpSeg.DestinationPort(), @@ -949,12 +1017,12 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - c.EP, _, err = ep.Accept() + c.EP, _, err = ep.Accept(nil) if err != nil { c.t.Fatalf("Accept failed: %v", err) } @@ -991,6 +1059,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP // value of the window scaling option to be sent in the SYN. If synOptions.WS > // 0 then we send the WindowScale option. func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint { + c.t.Helper() opts := make([]byte, header.TCPOptionsMaximumSize) offset := 0 offset += header.EncodeMSSOption(uint32(maxPayload), opts) @@ -1015,7 +1084,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions offset += paddingToAdd // Send a SYN request. - iss := seqnum.Value(testInitialSequenceNumber) + iss := seqnum.Value(TestInitialSequenceNumber) c.SendPacket(nil, &Headers{ SrcPort: TestPort, DstPort: StackPort, @@ -1029,13 +1098,14 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions // are present. b := c.GetPacket() tcp := header.TCP(header.IPv4(b).Payload()) + rcvdSynOptions := header.ParseSynOptions(tcp.Options(), true /* isAck */) c.IRS = seqnum.Value(tcp.SequenceNumber()) tcpCheckers := []checker.TransportChecker{ checker.SrcPort(StackPort), checker.DstPort(TestPort), checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), - checker.AckNum(uint32(iss) + 1), + checker.TCPAckNum(uint32(iss) + 1), checker.TCPSynOptions(header.TCPSynOptions{MSS: synOptions.MSS, WS: wndScale, SACKPermitted: synOptions.SACKPermitted && c.SACKEnabled()}), } @@ -1078,6 +1148,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions // Send ACK. c.SendPacket(nil, ackHeaders) + c.RcvdWindowScale = uint8(rcvdSynOptions.WS) c.Port = StackPort return &RawEndpoint{ @@ -1097,7 +1168,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions // SACKEnabled returns true if the TCP Protocol option SACKEnabled is set to true // for the Stack in the context. func (c *Context) SACKEnabled() bool { - var v tcp.SACKEnabled + var v tcpip.TCPSACKEnabled if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &v); err != nil { // Stack doesn't support SACK. So just return. return false diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index b5d2d0ba6..c78549424 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/ports", "//pkg/tcpip/stack", "//pkg/tcpip/transport/raw", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 73608783c..d57ed5d79 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -139,7 +139,7 @@ type endpoint struct { // multicastMemberships that need to be removed when the endpoint is // closed. Protected by the mu mutex.
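The hunks below replace that slice with a map keyed by the membership value, so the duplicate-join check and the removal both become O(1) lookups instead of linear scans with manual swap-and-truncate. A sketch of the bookkeeping under the map representation (the membership type mirrors the one in this file):

// membershipSet sketches the map-backed bookkeeping: join refuses
// duplicates the way SetSockOpt returns tcpip.ErrPortInUse, and leave
// deletes without searching for an index.
type membershipSet map[multicastMembership]struct{}

func (ms membershipSet) join(mem multicastMembership) bool {
	if _, ok := ms[mem]; ok {
		return false // already a member
	}
	ms[mem] = struct{}{}
	return true
}

func (ms membershipSet) leave(mem multicastMembership) bool {
	if _, ok := ms[mem]; !ok {
		return false // was never joined
	}
	delete(ms, mem)
	return true
}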
- multicastMemberships []multicastMembership + multicastMemberships map[multicastMembership]struct{} // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 @@ -154,6 +154,9 @@ type endpoint struct { // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // +stateify savable @@ -182,12 +185,13 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue // TTL=1. // // Linux defaults to TTL=1. - multicastTTL: 1, - multicastLoop: true, - rcvBufSizeMax: 32 * 1024, - sndBufSizeMax: 32 * 1024, - state: StateInitial, - uniqueID: s.UniqueID(), + multicastTTL: 1, + multicastLoop: true, + rcvBufSizeMax: 32 * 1024, + sndBufSizeMax: 32 * 1024, + multicastMemberships: make(map[multicastMembership]struct{}), + state: StateInitial, + uniqueID: s.UniqueID(), } // Override with stack defaults. @@ -209,7 +213,7 @@ func (e *endpoint) UniqueID() uint64 { return e.uniqueID } -func (e *endpoint) takeLastError() *tcpip.Error { +func (e *endpoint) LastError() *tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() @@ -237,10 +241,10 @@ func (e *endpoint) Close() { e.boundPortFlags = ports.Flags{} } - for _, mem := range e.multicastMemberships { + for mem := range e.multicastMemberships { e.stack.LeaveGroup(e.NetProto, mem.nicID, mem.multicastAddr) } - e.multicastMemberships = nil + e.multicastMemberships = make(map[multicastMembership]struct{}) // Close the receive list and drain it. e.rcvMu.Lock() @@ -268,7 +272,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {} // Read reads data from the endpoint. This method does not block if // there is no data pending. func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { - if err := e.takeLastError(); err != nil { + if err := e.LastError(); err != nil { return buffer.View{}, tcpip.ControlMessages{}, err } @@ -411,7 +415,7 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { - if err := e.takeLastError(); err != nil { + if err := e.LastError(); err != nil { return 0, nil, err } @@ -683,9 +687,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. 
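// The field change above replaces the membership slice with a map used as a
// set, so duplicate joins and leaves become O(1) lookups instead of linear
// scans. A minimal, self-contained sketch of the idiom (types illustrative):
package main

import "fmt"

type membership struct {
	nic  int
	addr string
}

func main() {
	set := make(map[membership]struct{})
	m := membership{nic: 1, addr: "224.0.0.1"}
	if _, ok := set[m]; ok {
		fmt.Println("duplicate join") // would map to tcpip.ErrPortInUse above
	}
	set[m] = struct{}{} // join
	delete(set, m)      // leave
}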
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { +func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { switch v := opt.(type) { - case tcpip.MulticastInterfaceOption: + case *tcpip.MulticastInterfaceOption: e.mu.Lock() defer e.mu.Unlock() @@ -721,7 +725,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.multicastNICID = nic e.multicastAddr = addr - case tcpip.AddMembershipOption: + case *tcpip.AddMembershipOption: if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { return tcpip.ErrInvalidOptionValue } @@ -752,19 +756,17 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - for _, mem := range e.multicastMemberships { - if mem == memToInsert { - return tcpip.ErrPortInUse - } + if _, ok := e.multicastMemberships[memToInsert]; ok { + return tcpip.ErrPortInUse } if err := e.stack.JoinGroup(e.NetProto, nicID, v.MulticastAddr); err != nil { return err } - e.multicastMemberships = append(e.multicastMemberships, memToInsert) + e.multicastMemberships[memToInsert] = struct{}{} - case tcpip.RemoveMembershipOption: + case *tcpip.RemoveMembershipOption: if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { return tcpip.ErrInvalidOptionValue } @@ -786,18 +788,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} - memToRemoveIndex := -1 e.mu.Lock() defer e.mu.Unlock() - for i, mem := range e.multicastMemberships { - if mem == memToRemove { - memToRemoveIndex = i - break - } - } - if memToRemoveIndex == -1 { + if _, ok := e.multicastMemberships[memToRemove]; !ok { return tcpip.ErrBadLocalAddress } @@ -805,11 +800,10 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { return err } - e.multicastMemberships[memToRemoveIndex] = e.multicastMemberships[len(e.multicastMemberships)-1] - e.multicastMemberships = e.multicastMemberships[:len(e.multicastMemberships)-1] + delete(e.multicastMemberships, memToRemove) - case tcpip.BindToDeviceOption: - id := tcpip.NICID(v) + case *tcpip.BindToDeviceOption: + id := tcpip.NICID(*v) if id != 0 && !e.stack.HasNIC(id) { return tcpip.ErrUnknownDevice } @@ -817,8 +811,13 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.bindToDevice = id e.mu.Unlock() - case tcpip.SocketDetachFilterOption: + case *tcpip.SocketDetachFilterOption: return nil + + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() } return nil } @@ -960,10 +959,8 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { switch o := opt.(type) { - case tcpip.ErrorOption: - return e.takeLastError() case *tcpip.MulticastInterfaceOption: e.mu.Lock() *o = tcpip.MulticastInterfaceOption{ @@ -977,6 +974,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = tcpip.BindToDeviceOption(e.bindToDevice) e.mu.RUnlock() + case *tcpip.LingerOption: + e.mu.RLock() + *o = e.linger + e.mu.RUnlock() + default: return tcpip.ErrUnknownProtocolOption } @@ -994,6 +996,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u // Initialize the UDP header. 
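// SetSockOpt and GetSockOpt now take typed option interfaces rather than
// interface{}, so callers pass pointers to concrete option types. A hedged
// usage sketch mirroring the updated tests later in this diff (endpoint
// construction elided):
package example

import "gvisor.dev/gvisor/pkg/tcpip"

func bindDevice(ep tcpip.Endpoint, nic tcpip.NICID) *tcpip.Error {
	opt := tcpip.BindToDeviceOption(nic)
	if err := ep.SetSockOpt(&opt); err != nil { // *BindToDeviceOption is settable
		return err
	}
	var got tcpip.BindToDeviceOption
	return ep.GetSockOpt(&got) // *BindToDeviceOption is gettable
}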
udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) + pkt.TransportProtocolNumber = ProtocolNumber length := uint16(pkt.Size()) udp.Encode(&header.UDPFields{ @@ -1220,13 +1223,13 @@ func (*endpoint) Listen(int) *tcpip.Error { } // Accept is not supported by UDP, it just fails. -func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { +func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { return nil, nil, tcpip.ErrNotSupported } func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) { if e.ID.LocalPort == 0 { - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{}) + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{}, nil /* testPort */) if err != nil { return id, e.bindToDevice, err } @@ -1366,6 +1369,22 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { return result } +// verifyChecksum verifies the checksum unless RX checksum offload is enabled. +// On IPv4, UDP checksum is optional, and a zero value means the transmitter +// omitted the checksum generation (RFC768). +// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). +func verifyChecksum(r *stack.Route, hdr header.UDP, pkt *stack.PacketBuffer) bool { + if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 && + (hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) { + xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length()) + for _, v := range pkt.Data.Views() { + xsum = header.Checksum(v, xsum) + } + return hdr.CalculateChecksum(xsum) == 0xffff + } + return true +} + // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { @@ -1378,33 +1397,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk return } - // Never receive from a multicast address. - if header.IsV4MulticastAddress(id.RemoteAddress) || - header.IsV6MulticastAddress(id.RemoteAddress) { - e.stack.Stats().UDP.InvalidSourceAddress.Increment() - e.stack.Stats().IP.InvalidSourceAddressesReceived.Increment() - e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() + if !verifyChecksum(r, hdr, pkt) { + // Checksum Error. + e.stack.Stats().UDP.ChecksumErrors.Increment() + e.stats.ReceiveErrors.ChecksumErrors.Increment() return } - // Verify checksum unless RX checksum offload is enabled. - // On IPv4, UDP checksum is optional, and a zero value means - // the transmitter omitted the checksum generation (RFC768). - // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). - if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 && - (hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) { - xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length()) - for _, v := range pkt.Data.Views() { - xsum = header.Checksum(v, xsum) - } - if hdr.CalculateChecksum(xsum) != 0xffff { - // Checksum Error. 
- e.stack.Stats().UDP.ChecksumErrors.Increment() - e.stats.ReceiveErrors.ChecksumErrors.Increment() - return - } - } - e.stack.Stats().UDP.PacketsReceived.Increment() e.stats.PacketsReceived.Increment() diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 851e6b635..858c99a45 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -92,7 +92,7 @@ func (e *endpoint) Resume(s *stack.Stack) { e.stack = s - for _, m := range e.multicastMemberships { + for m := range e.multicastMemberships { if err := e.stack.JoinGroup(e.NetProto, m.nicID, m.multicastAddr); err != nil { panic(err) } diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go index c67e0ba95..3ae6cc221 100644 --- a/pkg/tcpip/transport/udp/forwarder.go +++ b/pkg/tcpip/transport/udp/forwarder.go @@ -81,6 +81,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, ep.ID = r.id ep.route = r.route.Clone() ep.dstPort = r.id.RemotePort + ep.effectiveNetProtos = []tcpip.NetworkProtocolNumber{r.route.NetProto} ep.RegisterNICID = r.route.NICID() ep.boundPortFlags = ep.portFlags diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 63d4bed7c..da5b1deb2 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -12,18 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package udp contains the implementation of the UDP transport protocol. To use -// it in the networking stack, this package must be added to the project, and -// activated on the stack by passing udp.NewProtocol() as one of the -// transport protocols when calling stack.New(). Then endpoints can be created -// by passing udp.ProtocolNumber as the transport protocol number when calling -// Stack.NewEndpoint(). +// Package udp contains the implementation of the UDP transport protocol. package udp import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" @@ -49,6 +45,7 @@ const ( ) type protocol struct { + stack *stack.Stack } // Number returns the udp protocol number. @@ -57,14 +54,14 @@ func (*protocol) Number() tcpip.TransportProtocolNumber { } // NewEndpoint creates a new udp endpoint. -func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return newEndpoint(stack, netProto, waiterQueue), nil +func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(p.stack, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw UDP endpoint. It implements // stack.TransportProtocol.NewRawEndpoint. 
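// The verifyChecksum helper added earlier folds the pseudo-header sum across
// every data view and accepts the datagram when the final fold is 0xffff. A
// hedged, dependency-free sketch of the same ones'-complement arithmetic:
package main

import "fmt"

// onesComplementSum adds b to sum as big-endian 16-bit words, folding the
// carries back into the low 16 bits.
func onesComplementSum(b []byte, sum uint32) uint32 {
	for i := 0; i+1 < len(b); i += 2 {
		sum += uint32(b[i])<<8 | uint32(b[i+1])
	}
	if len(b)%2 == 1 {
		sum += uint32(b[len(b)-1]) << 8 // trailing odd byte is high-order
	}
	for sum>>16 != 0 {
		sum = (sum >> 16) + (sum & 0xffff)
	}
	return sum
}

func main() {
	// Summing pseudo-header, UDP header, and payload with the checksum field
	// included yields 0xffff for an intact datagram.
	fmt.Printf("%#x\n", onesComplementSum([]byte{0xff, 0xff}, 0))
}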
-func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return raw.NewEndpoint(stack, netProto, header.UDPProtocolNumber, waiterQueue) +func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return raw.NewEndpoint(p.stack, netProto, header.UDPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid udp packet size. @@ -79,130 +76,30 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { return h.SourcePort(), h.DestinationPort(), nil } -// HandleUnknownDestinationPacket handles packets targeted at this protocol but -// that don't match any existing endpoint. -func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { +// HandleUnknownDestinationPacket handles packets that are targeted at this +// protocol but don't match any existing endpoint. +func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { hdr := header.UDP(pkt.TransportHeader().View()) if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize { - // Malformed packet. r.Stack().Stats().UDP.MalformedPacketsReceived.Increment() - return true - } - // TODO(b/129426613): only send an ICMP message if UDP checksum is valid. - - // Only send ICMP error if the address is not a multicast/broadcast - // v4/v6 address or the source is not the unspecified address. - // - // See: point e) in https://tools.ietf.org/html/rfc4443#section-2.4 - if id.LocalAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(id.LocalAddress) || header.IsV6MulticastAddress(id.LocalAddress) || id.RemoteAddress == header.IPv6Any || id.RemoteAddress == header.IPv4Any { - return true + return stack.UnknownDestinationPacketMalformed } - // As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination - // Unreachable messages with code: - // - // 2 (Protocol Unreachable), when the designated transport protocol - // is not supported; or - // - // 3 (Port Unreachable), when the designated transport protocol - // (e.g., UDP) is unable to demultiplex the datagram but has no - // protocol mechanism to inform the sender. - switch len(id.LocalAddress) { - case header.IPv4AddressSize: - if !r.Stack().AllowICMPMessage() { - r.Stack().Stats().ICMP.V4PacketsSent.RateLimited.Increment() - return true - } - // As per RFC 1812 Section 4.3.2.3 - // - // ICMP datagram SHOULD contain as much of the original - // datagram as possible without the length of the ICMP - // datagram exceeding 576 bytes - // - // NOTE: The above RFC referenced is different from the original - // recommendation in RFC 1122 where it mentioned that at least 8 - // bytes of the payload must be included. Today linux and other - // systems implement the] RFC1812 definition and not the original - // RFC 1122 requirement. - mtu := int(r.MTU()) - if mtu > header.IPv4MinimumProcessableDatagramSize { - mtu = header.IPv4MinimumProcessableDatagramSize - } - headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize - available := int(mtu) - headerLen - payloadLen := pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size() + pkt.Data.Size() - if payloadLen > available { - payloadLen = available - } - - // The buffers used by pkt may be used elsewhere in the system. 
- // For example, a raw or packet socket may use what UDP - // considers an unreachable destination. Thus we deep copy pkt - // to prevent multiple ownership and SR errors. - newHeader := append(buffer.View(nil), pkt.NetworkHeader().View()...) - newHeader = append(newHeader, pkt.TransportHeader().View()...) - payload := newHeader.ToVectorisedView() - payload.AppendView(pkt.Data.ToView()) - payload.CapLength(payloadLen) - - icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: headerLen, - Data: payload, - }) - icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize)) - icmpHdr.SetType(header.ICMPv4DstUnreachable) - icmpHdr.SetCode(header.ICMPv4PortUnreachable) - icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data)) - r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, icmpPkt) - - case header.IPv6AddressSize: - if !r.Stack().AllowICMPMessage() { - r.Stack().Stats().ICMP.V6PacketsSent.RateLimited.Increment() - return true - } - - // As per RFC 4443 section 2.4 - // - // (c) Every ICMPv6 error message (type < 128) MUST include - // as much of the IPv6 offending (invoking) packet (the - // packet that caused the error) as possible without making - // the error message packet exceed the minimum IPv6 MTU - // [IPv6]. - mtu := int(r.MTU()) - if mtu > header.IPv6MinimumMTU { - mtu = header.IPv6MinimumMTU - } - headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize - available := int(mtu) - headerLen - network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() - payloadLen := len(network) + len(transport) + pkt.Data.Size() - if payloadLen > available { - payloadLen = available - } - payload := buffer.NewVectorisedView(len(network)+len(transport), []buffer.View{network, transport}) - payload.Append(pkt.Data) - payload.CapLength(payloadLen) - - icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: headerLen, - Data: payload, - }) - icmpHdr := header.ICMPv6(icmpPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize)) - icmpHdr.SetType(header.ICMPv6DstUnreachable) - icmpHdr.SetCode(header.ICMPv6PortUnreachable) - icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, icmpPkt.Data)) - r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, icmpPkt) + if !verifyChecksum(r, hdr, pkt) { + r.Stack().Stats().UDP.ChecksumErrors.Increment() + return stack.UnknownDestinationPacketMalformed } - return true + + return stack.UnknownDestinationPacketUnhandled } // SetOption implements stack.TransportProtocol.SetOption. -func (p *protocol) SetOption(option interface{}) *tcpip.Error { +func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } // Option implements stack.TransportProtocol.Option. -func (p *protocol) Option(option interface{}) *tcpip.Error { +func (*protocol) Option(tcpip.GettableTransportProtocolOption) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } @@ -214,11 +111,10 @@ func (*protocol) Wait() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) - return ok + return parse.UDP(pkt) } // NewProtocol returns a UDP transport protocol. 
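// HandleUnknownDestinationPacket now reports a disposition instead of a bare
// bool, letting the stack decide the follow-up. A hedged sketch of a caller
// branching on the two values used above (the surrounding dispatch loop and
// any other dispositions are elided):
package example

import "gvisor.dev/gvisor/pkg/tcpip/stack"

func dispatchUnknown(p stack.TransportProtocol, r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
	switch p.HandleUnknownDestinationPacket(r, id, pkt) {
	case stack.UnknownDestinationPacketMalformed:
		// Drop quietly; the malformed/checksum counters were incremented.
	case stack.UnknownDestinationPacketUnhandled:
		// The stack may answer with an ICMP Port Unreachable.
	}
}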
-func NewProtocol() stack.TransportProtocol { - return &protocol{} +func NewProtocol(s *stack.Stack) stack.TransportProtocol { + return &protocol{stack: s} } diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index f87d99d5a..b4604ba35 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -294,8 +294,8 @@ type testContext struct { func newDualTestContext(t *testing.T, mtu uint32) *testContext { t.Helper() return newDualTestContextWithOptions(t, mtu, stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) } @@ -388,6 +388,10 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto()) } + if got, want := p.Pkt.TransportProtocolNumber, header.UDPProtocolNumber; got != want { + c.t.Errorf("got p.Pkt.TransportProtocolNumber = %d, want = %d", got, want) + } + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) b := vv.ToView() @@ -403,18 +407,35 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw } // injectPacket creates a packet of the given flow and with the given payload, -// and injects it into the link endpoint. -func (c *testContext) injectPacket(flow testFlow, payload []byte) { +// and injects it into the link endpoint. If badChecksum is true, the packet has +// a bad checksum in the UDP header. +func (c *testContext) injectPacket(flow testFlow, payload []byte, badChecksum bool) { c.t.Helper() h := flow.header4Tuple(incoming) if flow.isV4() { buf := c.buildV4Packet(payload, &h) + if badChecksum { + // Invalidate the UDP header checksum field, taking care to avoid + // overflow to zero, which would disable checksum validation. + for u := header.UDP(buf[header.IPv4MinimumSize:]); ; { + u.SetChecksum(u.Checksum() + 1) + if u.Checksum() != 0 { + break + } + } + } c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), })) } else { buf := c.buildV6Packet(payload, &h) + if badChecksum { + // Invalidate the UDP header checksum field (Unlike IPv4, zero is + // a valid checksum value for IPv6 so no need to avoid it). 
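// NewProtocol is now a factory bound to the owning stack, so stacks are
// assembled from factory lists exactly as the updated tests below do.
// Minimal sketch:
package example

import (
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
)

func newUDPStack() *stack.Stack {
	return stack.New(stack.Options{
		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
	})
}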
+ u := header.UDP(buf[header.IPv6MinimumSize:]) + u.SetChecksum(u.Checksum() + 1) + } c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), })) @@ -511,8 +532,8 @@ func newMinPayload(minSize int) []byte { func TestBindToDeviceOption(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}}) + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}}) ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { @@ -522,7 +543,7 @@ func TestBindToDeviceOption(t *testing.T) { opts := stack.NICOptions{Name: "my_device"} if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil { - t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err) + t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %s", opts, err) } // nicIDPtr is used instead of taking the address of NICID literals, which is @@ -546,16 +567,15 @@ func TestBindToDeviceOption(t *testing.T) { t.Run(testAction.name, func(t *testing.T) { if testAction.setBindToDevice != nil { bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice) - if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr { - t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr) + if gotErr, wantErr := ep.SetSockOpt(&bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr { + t.Errorf("got SetSockOpt(&%T(%d)) = %s, want = %s", bindToDevice, bindToDevice, gotErr, wantErr) } } bindToDevice := tcpip.BindToDeviceOption(88888) if err := ep.GetSockOpt(&bindToDevice); err != nil { - t.Errorf("GetSockOpt got %v, want %v", err, nil) - } - if got, want := bindToDevice, testAction.getBindToDevice; got != want { - t.Errorf("bindToDevice got %d, want %d", got, want) + t.Errorf("GetSockOpt(&%T): %s", bindToDevice, err) + } else if bindToDevice != testAction.getBindToDevice { + t.Errorf("got bindToDevice = %d, want = %d", bindToDevice, testAction.getBindToDevice) } }) } @@ -569,7 +589,7 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe c.t.Helper() payload := newPayload() - c.injectPacket(flow, payload) + c.injectPacket(flow, payload, false) // Try to receive the data. we, ch := waiter.NewChannelEntry(nil) @@ -611,12 +631,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe // Check the peer address. h := flow.header4Tuple(incoming) if addr.Addr != h.srcAddr.Addr { - c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr) + c.t.Fatalf("got address = %s, want = %s", addr.Addr, h.srcAddr.Addr) } // Check the payload. if !bytes.Equal(payload, v) { - c.t.Fatalf("bad payload: got %x, want %x", v, payload) + c.t.Fatalf("got payload = %x, want = %x", v, payload) } // Run any checkers against the ControlMessages. @@ -677,7 +697,7 @@ func TestBindReservedPort(t *testing.T) { } defer ep.Close() if got, want := ep.Bind(addr), tcpip.ErrPortInUse; got != want { - t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want) + t.Fatalf("got ep.Bind(...) = %s, want = %s", got, want) } } @@ -690,7 +710,7 @@ func TestBindReservedPort(t *testing.T) { // We can't bind ipv4-any on the port reserved by the connected endpoint // above, since the endpoint is dual-stack. 
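// Why the IPv4 injection loop above must skip zero while the IPv6 path may
// increment freely: the UDP checksum is ones'-complement, and on IPv4 a
// transmitted value of 0 means "checksum omitted", so a corrupted field that
// wrapped to zero would be accepted instead of rejected; on IPv6 zero is
// validated like any other value. A tiny self-contained illustration of the
// wraparound being guarded against:
package main

import "fmt"

func main() {
	var csum uint16 = 0xffff
	csum++ // wraps to 0x0000, which IPv4 reads as "no checksum"
	fmt.Printf("%#04x\n", csum)
}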
if got, want := ep.Bind(tcpip.FullAddress{Port: addr.Port}), tcpip.ErrPortInUse; got != want { - t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want) + t.Fatalf("got ep.Bind(...) = %s, want = %s", got, want) } // We can bind an ipv4 address on this port, though. if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}); err != nil { @@ -787,8 +807,8 @@ func TestV4ReadSelfSource(t *testing.T) { } { t.Run(tt.name, func(t *testing.T) { c := newDualTestContextWithOptions(t, defaultMTU, stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, HandleLocal: tt.handleLocal, }) defer c.cleanup() @@ -813,7 +833,7 @@ func TestV4ReadSelfSource(t *testing.T) { } if _, _, err := c.ep.Read(nil); err != tt.wantErr { - t.Errorf("c.ep.Read() got error %v, want %v", err, tt.wantErr) + t.Errorf("got c.ep.Read(nil) = %s, want = %s", err, tt.wantErr) } }) } @@ -854,8 +874,8 @@ func TestReadOnBoundToMulticast(t *testing.T) { // Join multicast group. ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: mcastAddr} - if err := c.ep.SetSockOpt(ifoptSet); err != nil { - c.t.Fatal("SetSockOpt failed:", err) + if err := c.ep.SetSockOpt(&ifoptSet); err != nil { + c.t.Fatalf("SetSockOpt(&%#v): %s", ifoptSet, err) } // Check that we receive multicast packets but not unicast or broadcast @@ -908,42 +928,6 @@ func TestReadFromMulticast(t *testing.T) { } } -// TestReadFromMulticaststats checks that a discarded packet -// that that was sent with multicast SOURCE address increments -// the correct counters and that a regular packet does not. -func TestReadFromMulticastStats(t *testing.T) { - t.Helper() - for _, flow := range []testFlow{reverseMulticast4, reverseMulticast6, unicastV4} { - t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { - c := newDualTestContext(t, defaultMTU) - defer c.cleanup() - - c.createEndpointForFlow(flow) - - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - t.Fatalf("Bind failed: %s", err) - } - - payload := newPayload() - c.injectPacket(flow, payload) - - var want uint64 = 0 - if flow.isReverseMulticast() { - want = 1 - } - if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != want { - t.Errorf("got stats.IP.InvalidSourceAddressesReceived.Value() = %d, want = %d", got, want) - } - if got := c.s.Stats().UDP.InvalidSourceAddress.Value(); got != want { - t.Errorf("got stats.UDP.InvalidSourceAddress.Value() = %d, want = %d", got, want) - } - if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want { - t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want) - } - }) - } -} - // TestV4ReadBroadcastOnBoundToWildcard checks that an endpoint can bind to ANY // and receive broadcast and unicast data. 
func TestV4ReadBroadcastOnBoundToWildcard(t *testing.T) { @@ -1386,8 +1370,8 @@ func TestReadIPPacketInfo(t *testing.T) { if test.flow.isMulticast() { ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: test.flow.getMcastAddr()} - if err := c.ep.SetSockOpt(ifoptSet); err != nil { - c.t.Fatalf("SetSockOpt(%+v): %s:", ifoptSet, err) + if err := c.ep.SetSockOpt(&ifoptSet); err != nil { + c.t.Fatalf("SetSockOpt(&%#v): %s:", ifoptSet, err) } } @@ -1446,6 +1430,28 @@ func TestNoChecksum(t *testing.T) { } } +var _ stack.NetworkInterface = (*testInterface)(nil) + +type testInterface struct { + stack.NetworkLinkEndpoint +} + +func (*testInterface) ID() tcpip.NICID { + return 0 +} + +func (*testInterface) IsLoopback() bool { + return false +} + +func (*testInterface) Name() string { + return "" +} + +func (*testInterface) Enabled() bool { + return true +} + func TestTTL(t *testing.T) { for _, flow := range []testFlow{unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6} { t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { @@ -1463,16 +1469,19 @@ func TestTTL(t *testing.T) { if flow.isMulticast() { wantTTL = multicastTTL } else { - var p stack.NetworkProtocol + var p stack.NetworkProtocolFactory + var n tcpip.NetworkProtocolNumber if flow.isV4() { - p = ipv4.NewProtocol() + p = ipv4.NewProtocol + n = ipv4.ProtocolNumber } else { - p = ipv6.NewProtocol() + p = ipv6.NewProtocol + n = ipv6.ProtocolNumber } - ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, - })) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{p}, + }) + ep := s.NetworkProtocolInstance(n).NewEndpoint(&testInterface{}, nil, nil, nil) wantTTL = ep.DefaultTTL() ep.Close() } @@ -1496,18 +1505,6 @@ func TestSetTTL(t *testing.T) { c.t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err) } - var p stack.NetworkProtocol - if flow.isV4() { - p = ipv4.NewProtocol() - } else { - p = ipv6.NewProtocol() - } - ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, - })) - ep.Close() - testWrite(c, flow, checker.TTL(wantTTL)) }) } @@ -1530,7 +1527,7 @@ func TestSetTOS(t *testing.T) { } // Test for expected default value. if v != 0 { - c.t.Errorf("got GetSockOpt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0) + c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0) } if err := c.ep.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil { @@ -1691,19 +1688,17 @@ func TestMulticastInterfaceOption(t *testing.T) { } } - if err := c.ep.SetSockOpt(ifoptSet); err != nil { - c.t.Fatalf("SetSockOpt failed: %s", err) + if err := c.ep.SetSockOpt(&ifoptSet); err != nil { + c.t.Fatalf("SetSockOpt(&%#v): %s", ifoptSet, err) } // Verify multicast interface addr and NIC were set correctly. // Note that NIC must be 1 since this is our outgoing interface. 
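// The var _ line above is a compile-time conformance check: the build breaks
// if *testInterface ever stops implementing stack.NetworkInterface. The idiom
// in its generic, self-contained form:
package main

type greeter interface{ Greet() string }

type impl struct{}

func (impl) Greet() string { return "hi" }

var _ greeter = impl{} // fails to compile if impl loses Greet

func main() {}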
- ifoptWant := tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr} var ifoptGot tcpip.MulticastInterfaceOption if err := c.ep.GetSockOpt(&ifoptGot); err != nil { - c.t.Fatalf("GetSockOpt failed: %s", err) - } - if ifoptGot != ifoptWant { - c.t.Errorf("got GetSockOpt() = %#v, want = %#v", ifoptGot, ifoptWant) + c.t.Fatalf("GetSockOpt(&%T): %s", ifoptGot, err) + } else if ifoptWant := (tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr}); ifoptGot != ifoptWant { + c.t.Errorf("got multicast interface option = %#v, want = %#v", ifoptGot, ifoptWant) } }) } @@ -1727,21 +1722,33 @@ func TestV4UnknownDestination(t *testing.T) { // so that the final generated IPv4 packet is larger than // header.IPv4MinimumProcessableDatagramSize. largePayload bool + // badChecksum if true, will set an invalid checksum in the + // header. + badChecksum bool }{ - {unicastV4, true, false}, - {unicastV4, true, true}, - {multicastV4, false, false}, - {multicastV4, false, true}, - {broadcast, false, false}, - {broadcast, false, true}, - } + {unicastV4, true, false, false}, + {unicastV4, true, true, false}, + {unicastV4, false, false, true}, + {unicastV4, false, true, true}, + {multicastV4, false, false, false}, + {multicastV4, false, true, false}, + {broadcast, false, false, false}, + {broadcast, false, true, false}, + } + checksumErrors := uint64(0) for _, tc := range testCases { - t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) { + t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t badChecksum:%t", tc.flow, tc.icmpRequired, tc.largePayload, tc.badChecksum), func(t *testing.T) { payload := newPayload() if tc.largePayload { payload = newMinPayload(576) } - c.injectPacket(tc.flow, payload) + c.injectPacket(tc.flow, payload, tc.badChecksum) + if tc.badChecksum { + checksumErrors++ + if got, want := c.s.Stats().UDP.ChecksumErrors.Value(), checksumErrors; got != want { + t.Fatalf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want) + } + } if !tc.icmpRequired { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() @@ -1771,16 +1778,26 @@ func TestV4UnknownDestination(t *testing.T) { checker.ICMPv4Type(header.ICMPv4DstUnreachable), checker.ICMPv4Code(header.ICMPv4PortUnreachable))) + // We need to compare the included data part of the UDP packet that is in + // the ICMP packet with the matching original data. icmpPkt := header.ICMPv4(hdr.Payload()) payloadIPHeader := header.IPv4(icmpPkt.Payload()) + incomingHeaderLength := header.IPv4MinimumSize + header.UDPMinimumSize wantLen := len(payload) if tc.largePayload { - wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize*2 - header.ICMPv4MinimumSize - header.UDPMinimumSize + // To work out the data size we need to simulate what the sender would + // have done. The wanted size is the total available minus the sum of + // the headers in the UDP AND ICMP packets, given that we know the test + // had only a minimal IP header but the ICMP sender will have allowed + // for a maximally sized packet header. + wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MaximumHeaderSize - header.ICMPv4MinimumSize - incomingHeaderLength + } - // In case of large payloads the IP packet may be truncated. Update + // In the case of large payloads the IP packet may be truncated. Update // the length field before retrieving the udp datagram payload. 
- payloadIPHeader.SetTotalLength(uint16(wantLen + header.UDPMinimumSize + header.IPv4MinimumSize)) + // Add back the two headers within the payload. + payloadIPHeader.SetTotalLength(uint16(wantLen + incomingHeaderLength)) origDgram := header.UDP(payloadIPHeader.Payload()) if got, want := len(origDgram.Payload()), wantLen; got != want { @@ -1806,19 +1823,31 @@ func TestV6UnknownDestination(t *testing.T) { // largePayload if true will result in a payload large enough to // create an IPv6 packet > header.IPv6MinimumMTU bytes. largePayload bool + // badChecksum if true, will set an invalid checksum in the + // header. + badChecksum bool }{ - {unicastV6, true, false}, - {unicastV6, true, true}, - {multicastV6, false, false}, - {multicastV6, false, true}, - } + {unicastV6, true, false, false}, + {unicastV6, true, true, false}, + {unicastV6, false, false, true}, + {unicastV6, false, true, true}, + {multicastV6, false, false, false}, + {multicastV6, false, true, false}, + } + checksumErrors := uint64(0) for _, tc := range testCases { - t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) { + t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t badChecksum:%t", tc.flow, tc.icmpRequired, tc.largePayload, tc.badChecksum), func(t *testing.T) { payload := newPayload() if tc.largePayload { payload = newMinPayload(1280) } - c.injectPacket(tc.flow, payload) + c.injectPacket(tc.flow, payload, tc.badChecksum) + if tc.badChecksum { + checksumErrors++ + if got, want := c.s.Stats().UDP.ChecksumErrors.Value(), checksumErrors; got != want { + t.Fatalf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want) + } + } if !tc.icmpRequired { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() @@ -1953,74 +1982,29 @@ func TestShortHeader(t *testing.T) { } } -// TestIncrementChecksumErrorsV4 verifies if a checksum error is detected, +// TestBadChecksumErrors verifies if a checksum error is detected, // global and endpoint stats are incremented. -func TestIncrementChecksumErrorsV4(t *testing.T) { - c := newDualTestContext(t, defaultMTU) - defer c.cleanup() - - c.createEndpoint(ipv4.ProtocolNumber) - // Bind to wildcard. - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %s", err) - } - - payload := newPayload() - h := unicastV4.header4Tuple(incoming) - buf := c.buildV4Packet(payload, &h) +func TestBadChecksumErrors(t *testing.T) { + for _, flow := range []testFlow{unicastV4, unicastV6} { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() - // Invalidate the UDP header checksum field, taking care to avoid - // overflow to zero, which would disable checksum validation. - for u := header.UDP(buf[header.IPv4MinimumSize:]); ; { - u.SetChecksum(u.Checksum() + 1) - if u.Checksum() != 0 { - break + c.createEndpoint(flow.sockProto()) + // Bind to wildcard. 
+ if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { + c.t.Fatalf("Bind failed: %s", err) } - } - - c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - const want = 1 - if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want { - t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want) - } - if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want { - t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want) - } -} - -// TestIncrementChecksumErrorsV6 verifies if a checksum error is detected, -// global and endpoint stats are incremented. -func TestIncrementChecksumErrorsV6(t *testing.T) { - c := newDualTestContext(t, defaultMTU) - defer c.cleanup() - - c.createEndpoint(ipv6.ProtocolNumber) - // Bind to wildcard. - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %s", err) - } - payload := newPayload() - h := unicastV6.header4Tuple(incoming) - buf := c.buildV6Packet(payload, &h) + payload := newPayload() + c.injectPacket(flow, payload, true /* badChecksum */) - // Invalidate the UDP header checksum field. - u := header.UDP(buf[header.IPv6MinimumSize:]) - u.SetChecksum(u.Checksum() + 1) - - c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buf.ToVectorisedView(), - })) - - const want = 1 - if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want { - t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want) - } - if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want { - t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want) + const want = 1 + if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want { + t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want) + } + if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want { + t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want) + } } } @@ -2039,7 +2023,8 @@ func TestPayloadModifiedV4(t *testing.T) { payload := newPayload() h := unicastV4.header4Tuple(incoming) buf := c.buildV4Packet(payload, &h) - // Modify the payload so that the checksum value in the UDP header will be incorrect. + // Modify the payload so that the checksum value in the UDP header will be + // incorrect. buf[len(buf)-1]++ c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), @@ -2069,7 +2054,8 @@ func TestPayloadModifiedV6(t *testing.T) { payload := newPayload() h := unicastV6.header4Tuple(incoming) buf := c.buildV6Packet(payload, &h) - // Modify the payload so that the checksum value in the UDP header will be incorrect. + // Modify the payload so that the checksum value in the UDP header will be + // incorrect. buf[len(buf)-1]++ c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), @@ -2350,17 +2336,18 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: remNetSubnetBcast, - requiresBroadcastOpt: true, + remoteAddr: remNetSubnetBcast, + // TODO(gvisor.dev/issue/3938): Once we support marking a route as + // broadcast, this test should require the broadcast option to be set. 
+ requiresBroadcastOpt: false, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, - - TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, }) e := channel.New(0, defaultMTU, "") if err := s.CreateNIC(nicID1, e); err != nil { diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go index 052b6b99d..64d17f661 100644 --- a/pkg/test/dockerutil/container.go +++ b/pkg/test/dockerutil/container.go @@ -22,6 +22,7 @@ import ( "net" "os" "path" + "path/filepath" "regexp" "strconv" "strings" @@ -403,10 +404,13 @@ func (c *Container) CopyFiles(opts *RunOpts, target string, sources ...string) { return } for _, name := range sources { - src, err := testutil.FindFile(name) - if err != nil { - c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err) - return + src := name + if !filepath.IsAbs(src) { + src, err = testutil.FindFile(name) + if err != nil { + c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %w", name, err) + return + } } dst := path.Join(dir, path.Base(name)) if err := testutil.Copy(src, dst); err != nil { diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go index 952871f95..7027df1a5 100644 --- a/pkg/test/dockerutil/dockerutil.go +++ b/pkg/test/dockerutil/dockerutil.go @@ -60,7 +60,6 @@ var ( // enabled for each run. pprofBlock = flag.Bool("pprof-block", false, "enables block profiling with runsc debug") pprofCPU = flag.Bool("pprof-cpu", false, "enables CPU profiling with runsc debug") - pprofGo = flag.Bool("pprof-go", false, "enables goroutine profiling with runsc debug") pprofHeap = flag.Bool("pprof-heap", false, "enables heap profiling with runsc debug") pprofMutex = flag.Bool("pprof-mutex", false, "enables mutex profiling with runsc debug") ) diff --git a/pkg/test/dockerutil/profile.go b/pkg/test/dockerutil/profile.go index f0396ef24..55f9496cd 100644 --- a/pkg/test/dockerutil/profile.go +++ b/pkg/test/dockerutil/profile.go @@ -63,7 +63,7 @@ type Pprof struct { // MakePprofFromFlags makes a Pprof profile from flags. func MakePprofFromFlags(c *Container) *Pprof { - if !(*pprofBlock || *pprofCPU || *pprofGo || *pprofHeap || *pprofMutex) { + if !(*pprofBlock || *pprofCPU || *pprofHeap || *pprofMutex) { return nil } return &Pprof{ diff --git a/pkg/test/testutil/BUILD b/pkg/test/testutil/BUILD index 2d8f56bc0..c4b131896 100644 --- a/pkg/test/testutil/BUILD +++ b/pkg/test/testutil/BUILD @@ -12,7 +12,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/sync", - "//runsc/boot", + "//runsc/config", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go index 1580527b5..49ab87c58 100644 --- a/pkg/test/testutil/testutil.go +++ b/pkg/test/testutil/testutil.go @@ -44,7 +44,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -133,25 +133,28 @@ func Command(logger Logger, args ...string) *Cmd { // TestConfig returns the default configuration to use in tests. 
Note that // 'RootDir' must be set by caller if required. -func TestConfig(t *testing.T) *boot.Config { +func TestConfig(t *testing.T) *config.Config { logDir := os.TempDir() if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok { logDir = dir + "/" } - return &boot.Config{ - Debug: true, - DebugLog: path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"), - LogFormat: "text", - DebugLogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - Strace: true, - Platform: "ptrace", - FileAccess: boot.FileAccessExclusive, - NumNetworkChannels: 1, - TestOnlyAllowRunAsCurrentUserWithoutChroot: true, + // Only register flags if config is being used. Otherwise anyone that uses + // testutil will get flags registered and they may conflict. + config.RegisterFlags() + + conf, err := config.NewFromFlags() + if err != nil { + panic(err) } + // Change test defaults. + conf.Debug = true + conf.DebugLog = path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%") + conf.LogPackets = true + conf.Network = config.NetworkNone + conf.Strace = true + conf.TestOnlyAllowRunAsCurrentUserWithoutChroot = true + return conf } // NewSpecWithArgs creates a simple spec with the given args suitable for use @@ -203,7 +206,7 @@ func SetupRootDir() (string, func(), error) { // SetupContainer creates a bundle and root dir for the container, generates a // test config, and writes the spec to config.json in the bundle dir. -func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, cleanup func(), err error) { +func SetupContainer(spec *specs.Spec, conf *config.Config) (rootDir, bundleDir string, cleanup func(), err error) { rootDir, rootCleanup, err := SetupRootDir() if err != nil { return "", "", nil, err @@ -243,12 +246,15 @@ func writeSpec(dir string, spec *specs.Spec) error { return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) } +// idRandomSrc is a pseudo random generator used in RandomID. +var idRandomSrc = rand.New(rand.NewSource(time.Now().UnixNano())) + // RandomID returns 20 random bytes following the given prefix. func RandomID(prefix string) string { // Read 20 random bytes. b := make([]byte, 20) // "[Read] always returns len(p) and a nil error." --godoc - if _, err := rand.Read(b); err != nil { + if _, err := idRandomSrc.Read(b); err != nil { panic("rand.Read failed: " + err.Error()) } if prefix != "" { @@ -264,7 +270,7 @@ func RandomID(prefix string) string { // same name, sometimes between test runs the socket does not get cleaned up // quickly enough, causing container creation to fail. func RandomContainerID() string { - return RandomID("test-container-") + return RandomID("test-container") } // Copy copies file from src to dst. @@ -326,13 +332,13 @@ func PollContext(ctx context.Context, cb func() error) error { } // WaitForHTTP tries GET requests on a port until the call succeeds or timeout. -func WaitForHTTP(port int, timeout time.Duration) error { +func WaitForHTTP(ip string, port int, timeout time.Duration) error { cb := func() error { c := &http.Client{ // Calculate timeout to be able to do minimum 5 attempts. Timeout: timeout / 5, } - url := fmt.Sprintf("http://localhost:%d/", port) + url := fmt.Sprintf("http://%s:%d/", ip, port) resp, err := c.Get(url) if err != nil { log.Printf("Waiting %s: %v", url, err) diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index d843f19cf..c976d7230 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -522,7 +522,7 @@ func (s *ServerSocket) Listen() error { // This is always blocking.
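// TestConfig above now derives test configurations from the registered runsc
// flags instead of a hand-built boot.Config literal. A hedged sketch of the
// resulting pattern for a test binary (field values illustrative):
package example

import "gvisor.dev/gvisor/runsc/config"

func newTestConfig() (*config.Config, error) {
	config.RegisterFlags() // register the runsc flags once per binary
	conf, err := config.NewFromFlags()
	if err != nil {
		return nil, err
	}
	// Override per-test defaults on the returned config.
	conf.Debug = true
	conf.Network = config.NetworkNone
	return conf, nil
}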
// // Preconditions: -// * ServerSocket is listening (Listen called). +// * ServerSocket is listening (Listen called). func (s *ServerSocket) Accept() (*Socket, error) { fd, ok := s.socket.enterFD() if !ok { diff --git a/pkg/usermem/addr_range_seq_unsafe.go b/pkg/usermem/addr_range_seq_unsafe.go index c09337c15..495896ded 100644 --- a/pkg/usermem/addr_range_seq_unsafe.go +++ b/pkg/usermem/addr_range_seq_unsafe.go @@ -81,8 +81,10 @@ func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { return addrRangeSeqFromSliceLimited(slice, limit) } -// Preconditions: The combined length of all AddrRanges in slice <= limit. -// limit >= 0. If len(slice) != 0, then limit > 0. +// Preconditions: +// * The combined length of all AddrRanges in slice <= limit. +// * limit >= 0. +// * If len(slice) != 0, then limit > 0. func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { switch len(slice) { case 0: diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index cd6a0ea6b..9b1e7a085 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -21,7 +21,6 @@ import ( "io" "strconv" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/safemem" @@ -54,8 +53,10 @@ type IO interface { // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a // non-nil error explaining why. // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. toZero >= 0. + // Preconditions: + // * The caller must not hold mm.MemoryManager.mappingMu or any + // following locks in the lock order. + // * toZero >= 0. ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at @@ -66,9 +67,11 @@ type IO interface { // // CopyOutFrom calls src.ReadToBlocks at most once. // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. src.ReadToBlocks must not block - // on mm.MemoryManager.activeMu or any preceding locks in the lock order. + // Preconditions: + // * The caller must not hold mm.MemoryManager.mappingMu or any + // following locks in the lock order. + // * src.ReadToBlocks must not block on mm.MemoryManager.activeMu or + // any preceding locks in the lock order. CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to @@ -78,10 +81,11 @@ type IO interface { // // CopyInTo calls dst.WriteFromBlocks at most once. // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. dst.WriteFromBlocks must not - // block on mm.MemoryManager.activeMu or any preceding locks in the lock - // order. + // Preconditions: + // * The caller must not hold mm.MemoryManager.mappingMu or any + // following locks in the lock order. + // * dst.WriteFromBlocks must not block on mm.MemoryManager.activeMu or + // any preceding locks in the lock order. CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst @@ -93,25 +97,28 @@ type IO interface { // SwapUint32 atomically sets the uint32 value at addr to new and // returns the previous value. 
// - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. + // Preconditions: + // * The caller must not hold mm.MemoryManager.mappingMu or any + // following locks in the lock order. + // * addr must be aligned to a 4-byte boundary. SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) // CompareAndSwapUint32 atomically compares the uint32 value at addr to // old; if they are equal, the value in memory is replaced by new. In // either case, the previous value stored in memory is returned. // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. + // Preconditions: + // * The caller must not hold mm.MemoryManager.mappingMu or any + // following locks in the lock order. + // * addr must be aligned to a 4-byte boundary. CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) // LoadUint32 atomically loads the uint32 value at addr and returns it. // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. + // Preconditions: + // * The caller must not hold mm.MemoryManager.mappingMu or any + // following locks in the lock order. + // * addr must be aligned to a 4-byte boundary. LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) } @@ -176,51 +183,6 @@ func (rw *IOReadWriter) Write(src []byte) (int, error) { return n, err } -// CopyObjectOut copies a fixed-size value or slice of fixed-size values from -// src to the memory mapped at addr in uio. It returns the number of bytes -// copied. -// -// CopyObjectOut must use reflection to encode src; performance-sensitive -// clients should do encoding manually and use uio.CopyOut directly. -// -// Preconditions: As for IO.CopyOut. -func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) { - w := &IOReadWriter{ - Ctx: ctx, - IO: uio, - Addr: addr, - Opts: opts, - } - // Allocate a byte slice the size of the object being marshaled. This - // adds an extra reflection call, but avoids needing to grow the slice - // during encoding, which can result in many heap-allocated slices. - b := make([]byte, 0, binary.Size(src)) - return w.Write(binary.Marshal(b, ByteOrder, src)) -} - -// CopyObjectIn copies a fixed-size value or slice of fixed-size values from -// the memory mapped at addr in uio to dst. It returns the number of bytes -// copied. -// -// CopyObjectIn must use reflection to decode dst; performance-sensitive -// clients should use uio.CopyIn directly and do decoding manually. -// -// Preconditions: As for IO.CopyIn. -func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) { - r := &IOReadWriter{ - Ctx: ctx, - IO: uio, - Addr: addr, - Opts: opts, - } - buf := make([]byte, binary.Size(dst)) - if _, err := io.ReadFull(r, buf); err != nil { - return 0, err - } - binary.Unmarshal(buf, ByteOrder, dst) - return int(r.Addr - addr), nil -} - // CopyStringIn tuning parameters, defined outside that function for tests. const ( copyStringIncrement = 64 @@ -233,7 +195,8 @@ const ( // would exceed maxlen, CopyStringIn returns the string truncated to maxlen and // ENAMETOOLONG. // -// Preconditions: As for IO.CopyFromUser. maxlen >= 0. 
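// The reflection-based CopyObjectOut/CopyObjectIn are removed above; the tree
// relies on go_marshal-generated marshalling instead. A hedged sketch of the
// replacement pattern; the Marshallable method set used here is an assumption
// based on the marshal package, not something shown in this diff:
package example

import (
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/usermem"
)

// readObject decodes a +marshal type from user memory without reflection.
func readObject(cc marshal.CopyContext, addr usermem.Addr, obj marshal.Marshallable) error {
	_, err := obj.CopyIn(cc, addr) // assumed generated signature
	return err
}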
+// Preconditions: Same as IO.CopyFromUser, plus: +// * maxlen >= 0. func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) { initLen := maxlen if initLen > copyStringMaxInitBufLen { @@ -287,7 +250,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt // less. CopyOutVec returns the number of bytes copied; if this is less than // the maximum, it returns a non-nil error explaining why. // -// Preconditions: As for IO.CopyOut. +// Preconditions: Same as IO.CopyOut. func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) { var done int for !ars.IsEmpty() && done < len(src) { @@ -311,7 +274,7 @@ func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts // less. CopyInVec returns the number of bytes copied; if this is less than the // maximum, it returns a non-nil error explaining why. // -// Preconditions: As for IO.CopyIn. +// Preconditions: Same as IO.CopyIn. func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { var done int for !ars.IsEmpty() && done < len(dst) { @@ -335,7 +298,7 @@ func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts I // ZeroOutVec returns the number of bytes written; if this is less than the // maximum, it returns a non-nil error explaining why. // -// Preconditions: As for IO.ZeroOut. +// Preconditions: Same as IO.ZeroOut. func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) { var done int64 for !ars.IsEmpty() && done < toZero { @@ -388,7 +351,7 @@ func isASCIIWhitespace(b byte) bool { // // - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. // -// Preconditions: As for CopyInVec. +// Preconditions: Same as CopyInVec. func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { if len(dsts) == 0 { return 0, nil @@ -481,28 +444,28 @@ func (s IOSequence) NumBytes() int64 { // DropFirst returns a copy of s with s.Addrs.DropFirst(n). // -// Preconditions: As for AddrRangeSeq.DropFirst. +// Preconditions: Same as AddrRangeSeq.DropFirst. func (s IOSequence) DropFirst(n int) IOSequence { return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} } // DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). // -// Preconditions: As for AddrRangeSeq.DropFirst64. +// Preconditions: Same as AddrRangeSeq.DropFirst64. func (s IOSequence) DropFirst64(n int64) IOSequence { return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} } // TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). // -// Preconditions: As for AddrRangeSeq.TakeFirst. +// Preconditions: Same as AddrRangeSeq.TakeFirst. func (s IOSequence) TakeFirst(n int) IOSequence { return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} } // TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). // -// Preconditions: As for AddrRangeSeq.TakeFirst64. +// Preconditions: Same as AddrRangeSeq.TakeFirst64. func (s IOSequence) TakeFirst64(n int64) IOSequence { return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} } @@ -512,7 +475,7 @@ func (s IOSequence) TakeFirst64(n int64) IOSequence { // As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated // to s.NumBytes(), and a nil error will be returned. // -// Preconditions: As for CopyOutVec. +// Preconditions: Same as CopyOutVec. 
func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) } @@ -522,7 +485,7 @@ func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { // As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to // s.NumBytes(), and a nil error will be returned. // -// Preconditions: As for CopyInVec. +// Preconditions: Same as CopyInVec. func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) } @@ -532,21 +495,21 @@ func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { // As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated // to s.NumBytes(), and a nil error will be returned. // -// Preconditions: As for ZeroOutVec. +// Preconditions: Same as ZeroOutVec. func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) } // CopyOutFrom invokes s.CopyOutFrom over s.Addrs. // -// Preconditions: As for IO.CopyOutFrom. +// Preconditions: Same as IO.CopyOutFrom. func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) } // CopyInTo invokes s.CopyInTo over s.Addrs. // -// Preconditions: As for IO.CopyInTo. +// Preconditions: Same as IO.CopyInTo. func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) } diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go index bf3c5df2b..da60b0cc7 100644 --- a/pkg/usermem/usermem_test.go +++ b/pkg/usermem/usermem_test.go @@ -16,7 +16,6 @@ package usermem import ( "bytes" - "encoding/binary" "fmt" "reflect" "strings" @@ -174,23 +173,6 @@ type testStruct struct { Uint64 uint64 } -func TestCopyObject(t *testing.T) { - wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8} - wantN := binary.Size(wantObj) - b := &BytesIO{make([]byte, wantN)} - ctx := newContext() - if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil { - t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - var gotObj testStruct - if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil { - t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if gotObj != wantObj { - t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj) - } -} - func TestCopyStringInShort(t *testing.T) { // Tests for string length <= copyStringIncrement. want := strings.Repeat("A", copyStringIncrement-2) |
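// A hedged usage sketch tying together the IOSequence helpers documented
// above (sequence construction and error handling elided):
package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/usermem"
)

// copyPrefix writes at most 16 bytes of src through the sequence, relying on
// CopyOut truncating to s.NumBytes() as documented above.
func copyPrefix(ctx context.Context, s usermem.IOSequence, src []byte) (int, error) {
	return s.TakeFirst(16).CopyOut(ctx, src)
}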