229 files changed, 9363 insertions, 4279 deletions
diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index cc3571fad..d1ca56370 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -25,6 +25,8 @@ const (
 	F_SETLKW        = 7
 	F_SETOWN        = 8
 	F_GETOWN        = 9
+	F_SETSIG        = 10
+	F_GETSIG        = 11
 	F_SETOWN_EX     = 15
 	F_GETOWN_EX     = 16
 	F_DUPFD_CLOEXEC = 1024 + 6
diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go
index 1b2f76c0b..0adff8dff 100644
--- a/pkg/abi/linux/sem.go
+++ b/pkg/abi/linux/sem.go
@@ -32,6 +32,23 @@ const (
 	SEM_STAT_ANY = 20
 )
 
+// Information about system-wide sempahore limits and parameters.
+//
+// Source: include/uapi/linux/sem.h
+const (
+	SEMMNI = 32000
+	SEMMSL = 32000
+	SEMMNS = SEMMNI * SEMMSL
+	SEMOPM = 500
+	SEMVMX = 32767
+	SEMAEM = SEMVMX
+
+	// followings are unused in kernel
+	SEMUME = SEMOPM
+	SEMMNU = SEMMNS
+	SEMMAP = SEMMNS
+)
+
 const SEM_UNDO = 0x1000
 
 // Sembuf is equivalent to struct sembuf.
@@ -42,3 +59,21 @@ type Sembuf struct {
 	SemOp  int16
 	SemFlg int16
 }
+
+// SemInfo is equivalent to struct seminfo.
+//
+// Source: include/uapi/linux/sem.h
+//
+// +marshal
+type SemInfo struct {
+	SemMap uint32
+	SemMni uint32
+	SemMns uint32
+	SemMnu uint32
+	SemMsl uint32
+	SemOpm uint32
+	SemUme uint32
+	SemUsz uint32
+	SemVmx uint32
+	SemAem uint32
+}
diff --git a/pkg/crypto/BUILD b/pkg/crypto/BUILD
new file mode 100644
index 000000000..08fa772ca
--- /dev/null
+++ b/pkg/crypto/BUILD
@@ -0,0 +1,12 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "crypto",
+    srcs = [
+        "crypto.go",
+        "crypto_stdlib.go",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/pkg/sleep/empty.s b/pkg/crypto/crypto.go
index fb37360ac..b26b55d37 100644
--- a/pkg/sleep/empty.s
+++ b/pkg/crypto/crypto.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,4 +12,5 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Empty assembly file so empty func definitions work.
+// Package crypto wraps crypto primitives.
+package crypto
diff --git a/pkg/syncevent/waiter_asm_unsafe.go b/pkg/crypto/crypto_stdlib.go
index 19d6b0b15..74a55a123 100644
--- a/pkg/syncevent/waiter_asm_unsafe.go
+++ b/pkg/crypto/crypto_stdlib.go
@@ -12,13 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 arm64
-
-package syncevent
+package crypto
 
 import (
-	"unsafe"
+	"crypto/ecdsa"
+	"crypto/sha512"
+	"math/big"
 )
 
-// See waiter_noasm_unsafe.go for a description of waiterUnlock.
-func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool
+// EcdsaVerify verifies the signature in r, s of hash using ECDSA and the
+// public key, pub. Its return value records whether the signature is valid.
+func EcdsaVerify(pub *ecdsa.PublicKey, hash []byte, r, s *big.Int) bool {
+	return ecdsa.Verify(pub, hash, r, s)
+}
+
+// SumSha384 returns the SHA384 checksum of the data.
+func SumSha384(data []byte) (sum384 [sha512.Size384]byte) {
+	return sha512.Sum384(data)
+}
diff --git a/pkg/flipcall/ctrl_futex.go b/pkg/flipcall/ctrl_futex.go
index e7c3a3a0b..2e8452a02 100644
--- a/pkg/flipcall/ctrl_futex.go
+++ b/pkg/flipcall/ctrl_futex.go
@@ -40,17 +40,41 @@ func (ep *Endpoint) ctrlInit(opts ...EndpointOption) error {
 	return nil
 }
 
-type ctrlHandshakeRequest struct{}
-
-type ctrlHandshakeResponse struct{}
-
 func (ep *Endpoint) ctrlConnect() error {
 	if err := ep.enterFutexWait(); err != nil {
 		return err
 	}
-	_, err := ep.futexConnect(&ctrlHandshakeRequest{})
-	ep.exitFutexWait()
-	return err
+	defer ep.exitFutexWait()
+
+	// Write the connection request.
+	w := ep.NewWriter()
+	if err := json.NewEncoder(w).Encode(struct{}{}); err != nil {
+		return fmt.Errorf("error writing connection request: %v", err)
+	}
+	*ep.dataLen() = w.Len()
+
+	// Exchange control with the server.
+	if err := ep.futexSetPeerActive(); err != nil {
+		return err
+	}
+	if err := ep.futexWakePeer(); err != nil {
+		return err
+	}
+	if err := ep.futexWaitUntilActive(); err != nil {
+		return err
+	}
+
+	// Read the connection response.
+	var resp struct{}
+	respLen := atomic.LoadUint32(ep.dataLen())
+	if respLen > ep.dataCap {
+		return fmt.Errorf("invalid connection response length %d (maximum %d)", respLen, ep.dataCap)
+	}
+	if err := json.NewDecoder(ep.NewReader(respLen)).Decode(&resp); err != nil {
+		return fmt.Errorf("error reading connection response: %v", err)
+	}
+
+	return nil
 }
 
 func (ep *Endpoint) ctrlWaitFirst() error {
@@ -59,52 +83,61 @@ func (ep *Endpoint) ctrlWaitFirst() error {
 	}
 	defer ep.exitFutexWait()
 
-	// Wait for the handshake request.
-	if err := ep.futexSwitchFromPeer(); err != nil {
+	// Wait for the connection request.
+	if err := ep.futexWaitUntilActive(); err != nil {
 		return err
 	}
 
-	// Read the handshake request.
+	// Read the connection request.
 	reqLen := atomic.LoadUint32(ep.dataLen())
 	if reqLen > ep.dataCap {
-		return fmt.Errorf("invalid handshake request length %d (maximum %d)", reqLen, ep.dataCap)
+		return fmt.Errorf("invalid connection request length %d (maximum %d)", reqLen, ep.dataCap)
 	}
-	var req ctrlHandshakeRequest
+	var req struct{}
 	if err := json.NewDecoder(ep.NewReader(reqLen)).Decode(&req); err != nil {
-		return fmt.Errorf("error reading handshake request: %v", err)
+		return fmt.Errorf("error reading connection request: %v", err)
 	}
 
-	// Write the handshake response.
+	// Write the connection response.
 	w := ep.NewWriter()
-	if err := json.NewEncoder(w).Encode(ctrlHandshakeResponse{}); err != nil {
-		return fmt.Errorf("error writing handshake response: %v", err)
+	if err := json.NewEncoder(w).Encode(struct{}{}); err != nil {
+		return fmt.Errorf("error writing connection response: %v", err)
 	}
 	*ep.dataLen() = w.Len()
 
 	// Return control to the client.
 	raceBecomeInactive()
-	if err := ep.futexSwitchToPeer(); err != nil {
+	if err := ep.futexSetPeerActive(); err != nil {
+		return err
+	}
+	if err := ep.futexWakePeer(); err != nil {
 		return err
 	}
 
-	// Wait for the first non-handshake message.
-	return ep.futexSwitchFromPeer()
+	// Wait for the first non-connection message.
+	return ep.futexWaitUntilActive()
 }
 
 func (ep *Endpoint) ctrlRoundTrip() error {
-	if err := ep.futexSwitchToPeer(); err != nil {
+	if err := ep.enterFutexWait(); err != nil {
 		return err
 	}
-	if err := ep.enterFutexWait(); err != nil {
+	defer ep.exitFutexWait()
+
+	if err := ep.futexSetPeerActive(); err != nil {
 		return err
 	}
-	err := ep.futexSwitchFromPeer()
-	ep.exitFutexWait()
-	return err
+	if err := ep.futexWakePeer(); err != nil {
+		return err
+	}
+	return ep.futexWaitUntilActive()
 }
 
 func (ep *Endpoint) ctrlWakeLast() error {
-	return ep.futexSwitchToPeer()
+	if err := ep.futexSetPeerActive(); err != nil {
+		return err
+	}
+	return ep.futexWakePeer()
 }
 
 func (ep *Endpoint) enterFutexWait() error {
diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go
index ac974b232..580bf23a4 100644
--- a/pkg/flipcall/flipcall_unsafe.go
+++ b/pkg/flipcall/flipcall_unsafe.go
@@ -41,11 +41,11 @@ const (
 )
 
 func (ep *Endpoint) connState() *uint32 {
-	return (*uint32)((unsafe.Pointer)(ep.packet))
+	return (*uint32)(unsafe.Pointer(ep.packet))
 }
 
 func (ep *Endpoint) dataLen() *uint32 {
-	return (*uint32)((unsafe.Pointer)(ep.packet + 4))
+	return (*uint32)(unsafe.Pointer(ep.packet + 4))
 }
 
 // Data returns the datagram part of ep's packet window as a byte slice.
@@ -63,7 +63,7 @@ func (ep *Endpoint) dataLen() *uint32 {
 // all.
 func (ep *Endpoint) Data() []byte {
 	var bs []byte
-	bsReflect := (*reflect.SliceHeader)((unsafe.Pointer)(&bs))
+	bsReflect := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
 	bsReflect.Data = ep.packet + PacketHeaderBytes
 	bsReflect.Len = int(ep.dataCap)
 	bsReflect.Cap = int(ep.dataCap)
@@ -76,12 +76,12 @@ var ioSync int64
 
 func raceBecomeActive() {
 	if sync.RaceEnabled {
-		sync.RaceAcquire((unsafe.Pointer)(&ioSync))
+		sync.RaceAcquire(unsafe.Pointer(&ioSync))
 	}
 }
 
 func raceBecomeInactive() {
 	if sync.RaceEnabled {
-		sync.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
+		sync.RaceReleaseMerge(unsafe.Pointer(&ioSync))
 	}
 }
diff --git a/pkg/flipcall/futex_linux.go b/pkg/flipcall/futex_linux.go
index 168c1ccff..0e559ee16 100644
--- a/pkg/flipcall/futex_linux.go
+++ b/pkg/flipcall/futex_linux.go
@@ -17,7 +17,6 @@
 package flipcall
 
 import (
-	"encoding/json"
 	"fmt"
 	"runtime"
 	"sync/atomic"
@@ -26,55 +25,26 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
-func (ep *Endpoint) futexConnect(req *ctrlHandshakeRequest) (ctrlHandshakeResponse, error) {
-	var resp ctrlHandshakeResponse
-
-	// Write the handshake request.
-	w := ep.NewWriter()
-	if err := json.NewEncoder(w).Encode(req); err != nil {
-		return resp, fmt.Errorf("error writing handshake request: %v", err)
-	}
-	*ep.dataLen() = w.Len()
-
-	// Exchange control with the server.
-	if err := ep.futexSwitchToPeer(); err != nil {
-		return resp, err
+func (ep *Endpoint) futexSetPeerActive() error {
+	if atomic.CompareAndSwapUint32(ep.connState(), ep.activeState, ep.inactiveState) {
+		return nil
 	}
-	if err := ep.futexSwitchFromPeer(); err != nil {
-		return resp, err
+	switch cs := atomic.LoadUint32(ep.connState()); cs {
+	case csShutdown:
+		return ShutdownError{}
+	default:
+		return fmt.Errorf("unexpected connection state before FUTEX_WAKE: %v", cs)
 	}
-
-	// Read the handshake response.
-	respLen := atomic.LoadUint32(ep.dataLen())
-	if respLen > ep.dataCap {
-		return resp, fmt.Errorf("invalid handshake response length %d (maximum %d)", respLen, ep.dataCap)
-	}
-	if err := json.NewDecoder(ep.NewReader(respLen)).Decode(&resp); err != nil {
-		return resp, fmt.Errorf("error reading handshake response: %v", err)
-	}
-
-	return resp, nil
 }
 
-func (ep *Endpoint) futexSwitchToPeer() error {
-	// Update connection state to indicate that the peer should be active.
-	if !atomic.CompareAndSwapUint32(ep.connState(), ep.activeState, ep.inactiveState) {
-		switch cs := atomic.LoadUint32(ep.connState()); cs {
-		case csShutdown:
-			return ShutdownError{}
-		default:
-			return fmt.Errorf("unexpected connection state before FUTEX_WAKE: %v", cs)
-		}
-	}
-
-	// Wake the peer's Endpoint.futexSwitchFromPeer().
+func (ep *Endpoint) futexWakePeer() error {
 	if err := ep.futexWakeConnState(1); err != nil {
 		return fmt.Errorf("failed to FUTEX_WAKE peer Endpoint: %v", err)
 	}
 	return nil
 }
 
-func (ep *Endpoint) futexSwitchFromPeer() error {
+func (ep *Endpoint) futexWaitUntilActive() error {
 	for {
 		switch cs := atomic.LoadUint32(ep.connState()); cs {
 		case ep.activeState:
diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
index d855b702c..08832a8ae 100644
--- a/pkg/goid/BUILD
+++ b/pkg/goid/BUILD
@@ -9,6 +9,7 @@ go_library(
         "goid_amd64.s",
         "goid_arm64.s",
     ],
+    stateify = False,
     visibility = ["//visibility:public"],
 )
 
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index 71e944c30..eadea390a 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -570,6 +570,8 @@ func (c *Client) Version() uint32 {
 func (c *Client) Close() {
 	// unet.Socket.Shutdown() has no effect if unet.Socket.Close() has already
 	// been called (by c.watch()).
-	c.socket.Shutdown()
+	if err := c.socket.Shutdown(); err != nil {
+		log.Warningf("Socket.Shutdown() failed (FD: %d): %v", c.socket.FD(), err)
+	}
 	c.closedWg.Wait()
 }
diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 28fe081d6..8b46a2987 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -478,28 +478,23 @@ func (r *ReadWriterFile) ReadAt(p []byte, offset int64) (int, error) {
 }
 
 // Write implements part of the io.ReadWriter interface.
+//
+// Note that this may return a short write with a nil error. This violates the
+// contract of io.Writer, but is more consistent with gVisor's pattern of
+// returning errors that correspond to Linux errnos. Since short writes without
+// error are common in Linux, returning a nil error is appropriate.
 func (r *ReadWriterFile) Write(p []byte) (int, error) {
 	n, err := r.File.WriteAt(p, r.Offset)
 	r.Offset += uint64(n)
-	if err != nil {
-		return n, err
-	}
-	if n < len(p) {
-		return n, io.ErrShortWrite
-	}
-	return n, nil
+	return n, err
 }
 
 // WriteAt implements the io.WriteAt interface.
+//
+// Note that this may return a short write with a nil error. This violates the
+// contract of io.WriterAt. See comment on Write for justification.
 func (r *ReadWriterFile) WriteAt(p []byte, offset int64) (int, error) {
-	n, err := r.File.WriteAt(p, uint64(offset))
-	if err != nil {
-		return n, err
-	}
-	if n < len(p) {
-		return n, io.ErrShortWrite
-	}
-	return n, nil
+	return r.File.WriteAt(p, uint64(offset))
 }
 
 // Rename implements File.Rename.
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index abd237f46..81ceb37c5 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -296,25 +296,6 @@ func (t *Tlopen) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	ref.openedMu.Lock()
-	defer ref.openedMu.Unlock()
-
-	// Has it been opened already?
-	if ref.opened || !CanOpen(ref.mode) {
-		return newErr(syscall.EINVAL)
-	}
-
-	if ref.mode.IsDir() {
-		// Directory must be opened ReadOnly.
-		if t.Flags&OpenFlagsModeMask != ReadOnly {
-			return newErr(syscall.EISDIR)
-		}
-		// Directory not truncatable.
-		if t.Flags&OpenTruncate != 0 {
-			return newErr(syscall.EISDIR)
-		}
-	}
-
 	var (
 		qid    QID
 		ioUnit uint32
@@ -326,6 +307,22 @@ func (t *Tlopen) handle(cs *connState) message {
 			return syscall.EINVAL
 		}
 
+		// Has it been opened already?
+		if ref.opened || !CanOpen(ref.mode) {
+			return syscall.EINVAL
+		}
+
+		if ref.mode.IsDir() {
+			// Directory must be opened ReadOnly.
+			if t.Flags&OpenFlagsModeMask != ReadOnly {
+				return syscall.EISDIR
+			}
+			// Directory not truncatable.
+			if t.Flags&OpenTruncate != 0 {
+				return syscall.EISDIR
+			}
+		}
+
 		osFile, qid, ioUnit, err = ref.file.Open(t.Flags)
 		return err
 	}); err != nil {
@@ -366,7 +363,7 @@ func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -437,7 +434,7 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -476,7 +473,7 @@ func (t *Tlink) handle(cs *connState) message {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -518,7 +515,7 @@ func (t *Trenameat) handle(cs *connState) message {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -561,7 +558,7 @@ func (t *Tunlinkat) handle(cs *connState) message {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -701,13 +698,12 @@ func (t *Tread) handle(cs *connState) message {
 	)
 	if err := ref.safelyRead(func() (err error) {
 		// Has it been opened already?
-		openFlags, opened := ref.OpenFlags()
-		if !opened {
+		if !ref.opened {
 			return syscall.EINVAL
 		}
 
 		// Can it be read? Check permissions.
-		if openFlags&OpenFlagsModeMask == WriteOnly {
+		if ref.openFlags&OpenFlagsModeMask == WriteOnly {
 			return syscall.EPERM
 		}
 
@@ -731,13 +727,12 @@ func (t *Twrite) handle(cs *connState) message {
 	var n int
 	if err := ref.safelyRead(func() (err error) {
 		// Has it been opened already?
-		openFlags, opened := ref.OpenFlags()
-		if !opened {
+		if !ref.opened {
 			return syscall.EINVAL
 		}
 
 		// Can it be written? Check permissions.
-		if openFlags&OpenFlagsModeMask == ReadOnly {
+		if ref.openFlags&OpenFlagsModeMask == ReadOnly {
 			return syscall.EPERM
 		}
 
@@ -778,7 +773,7 @@ func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -820,7 +815,7 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
 		}
 
 		// Not allowed on open directories.
-		if _, opened := ref.OpenFlags(); opened {
+		if ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -898,13 +893,12 @@ func (t *Tallocate) handle(cs *connState) message {
 
 	if err := ref.safelyWrite(func() error {
 		// Has it been opened already?
-		openFlags, opened := ref.OpenFlags()
-		if !opened {
+		if !ref.opened {
 			return syscall.EINVAL
 		}
 
 		// Can it be written? Check permissions.
-		if openFlags&OpenFlagsModeMask == ReadOnly {
+		if ref.openFlags&OpenFlagsModeMask == ReadOnly {
 			return syscall.EBADF
 		}
 
@@ -1049,8 +1043,8 @@ func (t *Treaddir) handle(cs *connState) message {
 			return syscall.EINVAL
 		}
 
-		// Has it been opened already?
-		if _, opened := ref.OpenFlags(); !opened {
+		// Has it been opened yet?
+		if !ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -1076,8 +1070,8 @@ func (t *Tfsync) handle(cs *connState) message {
 	defer ref.DecRef()
 
 	if err := ref.safelyRead(func() (err error) {
-		// Has it been opened already?
-		if _, opened := ref.OpenFlags(); !opened {
+		// Has it been opened yet?
+		if !ref.opened {
 			return syscall.EINVAL
 		}
 
@@ -1185,8 +1179,13 @@ func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QI
 	}
 
 	// Has it been opened already?
-	if _, opened := ref.OpenFlags(); opened {
-		err = syscall.EBUSY
+	err = ref.safelyRead(func() (err error) {
+		if ref.opened {
+			return syscall.EBUSY
+		}
+		return nil
+	})
+	if err != nil {
 		return
 	}
 
diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index 3736f12a3..8c5c434fd 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -134,12 +134,11 @@ type fidRef struct {
 	// The node above will be closed only when refs reaches zero.
 	refs int64
 
-	// openedMu protects opened and openFlags.
-	openedMu sync.Mutex
-
 	// opened indicates whether this has been opened already.
 	//
 	// This is updated in handlers.go.
+	//
+	// opened is protected by pathNode.opMu or renameMu (for write).
 	opened bool
 
 	// mode is the fidRef's mode from the walk. Only the type bits are
@@ -151,6 +150,8 @@ type fidRef struct {
 	// openFlags is the mode used in the open.
 	//
 	// This is updated in handlers.go.
+	//
+	// openFlags is protected by pathNode.opMu or renameMu (for write).
 	openFlags OpenFlags
 
 	// pathNode is the current pathNode for this FID.
@@ -177,13 +178,6 @@ type fidRef struct {
 	deleted uint32
 }
 
-// OpenFlags returns the flags the file was opened with and true iff the fid was opened previously.
-func (f *fidRef) OpenFlags() (OpenFlags, bool) {
-	f.openedMu.Lock()
-	defer f.openedMu.Unlock()
-	return f.openFlags, f.opened
-}
-
 // IncRef increases the references on a fid.
 func (f *fidRef) IncRef() {
 	atomic.AddInt64(&f.refs, 1)
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go
index c9fb55d00..5138f3bf5 100644
--- a/pkg/sentry/arch/signal.go
+++ b/pkg/sentry/arch/signal.go
@@ -251,3 +251,26 @@ func (s *SignalInfo) Arch() uint32 {
 func (s *SignalInfo) SetArch(val uint32) {
 	usermem.ByteOrder.PutUint32(s.Fields[12:16], val)
 }
+
+// Band returns the si_band field.
+func (s *SignalInfo) Band() int64 {
+	return int64(usermem.ByteOrder.Uint64(s.Fields[0:8]))
+}
+
+// SetBand mutates the si_band field.
+func (s *SignalInfo) SetBand(val int64) {
+	// Note: this assumes the platform uses `long` as `__ARCH_SI_BAND_T`.
+	// On some platforms, which gVisor doesn't support, `__ARCH_SI_BAND_T` is
+	// `int`. See siginfo.h.
+	usermem.ByteOrder.PutUint64(s.Fields[0:8], uint64(val))
+}
+
+// FD returns the si_fd field.
+func (s *SignalInfo) FD() uint32 {
+	return usermem.ByteOrder.Uint32(s.Fields[8:12])
+}
+
+// SetFD mutates the si_fd field.
+func (s *SignalInfo) SetFD(val uint32) {
+	usermem.ByteOrder.PutUint32(s.Fields[8:12], val)
+}
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index ea85ab33c..5c3e852e9 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -49,13 +49,13 @@ go_library(
         "//pkg/amutex",
         "//pkg/context",
         "//pkg/log",
-        "//pkg/metric",
         "//pkg/p9",
         "//pkg/refs",
         "//pkg/secio",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsmetric",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 72ea70fcf..57f904801 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -17,13 +17,12 @@ package fs
 import (
 	"math"
 	"sync/atomic"
-	"time"
 
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
@@ -33,28 +32,6 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-var (
-	// RecordWaitTime controls writing metrics for filesystem reads.
-	// Enabling this comes at a small CPU cost due to performing two
-	// monotonic clock reads per read call.
-	//
-	// Note that this is only performed in the direct read path, and may
-	// not be consistently applied for other forms of reads, such as
-	// splice.
-	RecordWaitTime = false
-
-	reads    = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.")
-	readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
-)
-
-// IncrementWait increments the given wait time metric, if enabled.
-func IncrementWait(m *metric.Uint64Metric, start time.Time) {
-	if !RecordWaitTime {
-		return
-	}
-	m.IncrementBy(uint64(time.Since(start)))
-}
-
 // FileMaxOffset is the maximum possible file offset.
 const FileMaxOffset = math.MaxInt64
 
@@ -257,22 +234,19 @@ func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error {
 //
 // Returns syserror.ErrInterrupted if reading was interrupted.
 func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) {
-	var start time.Time
-	if RecordWaitTime {
-		start = time.Now()
-	}
+	start := fsmetric.StartReadWait()
+	defer fsmetric.FinishReadWait(fsmetric.ReadWait, start)
+
 	if !f.mu.Lock(ctx) {
-		IncrementWait(readWait, start)
 		return 0, syserror.ErrInterrupted
 	}
 
-	reads.Increment()
+	fsmetric.Reads.Increment()
 	n, err := f.FileOperations.Read(ctx, f, dst, f.offset)
 	if n > 0 && !f.flags.NonSeekable {
 		atomic.AddInt64(&f.offset, n)
 	}
 	f.mu.Unlock()
-	IncrementWait(readWait, start)
 	return n, err
 }
 
@@ -282,19 +256,16 @@ func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error)
 //
 // Otherwise same as Readv.
 func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
-	var start time.Time
-	if RecordWaitTime {
-		start = time.Now()
-	}
+	start := fsmetric.StartReadWait()
+	defer fsmetric.FinishReadWait(fsmetric.ReadWait, start)
+
 	if !f.mu.Lock(ctx) {
-		IncrementWait(readWait, start)
 		return 0, syserror.ErrInterrupted
 	}
 
-	reads.Increment()
+	fsmetric.Reads.Increment()
 	n, err := f.FileOperations.Read(ctx, f, dst, offset)
 	f.mu.Unlock()
-	IncrementWait(readWait, start)
 	return n, err
 }
 
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index d2dbff268..a020da53b 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -65,7 +65,7 @@ var (
 	// runs with the lock held for reading. AsyncBarrier will take the lock
 	// for writing, thus ensuring that all Async work completes before
 	// AsyncBarrier returns.
-	workMu sync.RWMutex
+	workMu sync.CrossGoroutineRWMutex
 
 	// asyncError is used to store up to one asynchronous execution error.
 	asyncError = make(chan error, 1)
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index fea135eea..4c30098cd 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -28,7 +28,6 @@ go_library(
         "//pkg/context",
         "//pkg/fd",
         "//pkg/log",
-        "//pkg/metric",
         "//pkg/p9",
         "//pkg/refs",
         "//pkg/safemem",
@@ -38,6 +37,7 @@ go_library(
         "//pkg/sentry/fs/fdpipe",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/host",
+        "//pkg/sentry/fsmetric",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index c0bc63a32..bb63448cb 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -21,27 +21,17 @@ import (
 
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-var (
-	opensWX      = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.")
-	opens9P      = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.")
-	opensHost    = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.")
-	reads9P      = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.")
-	readWait9P   = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.")
-	readsHost    = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.")
-	readWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.")
-)
-
 // fileOperations implements fs.FileOperations for a remote file system.
 //
 // +stateify savable
@@ -101,14 +91,14 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF
 	}
 	if flags.Write {
 		if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil {
-			opensWX.Increment()
+			fsmetric.GoferOpensWX.Increment()
 			log.Warningf("Opened a writable executable: %q", name)
 		}
 	}
 	if handles.Host != nil {
-		opensHost.Increment()
+		fsmetric.GoferOpensHost.Increment()
 	} else {
-		opens9P.Increment()
+		fsmetric.GoferOpens9P.Increment()
 	}
 	return fs.NewFile(ctx, dirent, flags, f)
 }
@@ -278,20 +268,17 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I
 // use this function rather than using a defer in Read() to avoid the performance hit of defer.
 func (f *fileOperations) incrementReadCounters(start time.Time) {
 	if f.handles.Host != nil {
-		readsHost.Increment()
-		fs.IncrementWait(readWaitHost, start)
+		fsmetric.GoferReadsHost.Increment()
+		fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
 	} else {
-		reads9P.Increment()
-		fs.IncrementWait(readWait9P, start)
+		fsmetric.GoferReads9P.Increment()
+		fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
 	}
 }
 
 // Read implements fs.FileOperations.Read.
 func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	var start time.Time
-	if fs.RecordWaitTime {
-		start = time.Now()
-	}
+	start := fsmetric.StartReadWait()
 	if fs.IsDir(file.Dirent.Inode.StableAttr) {
 		// Not all remote file systems enforce this so this client does.
 		f.incrementReadCounters(start)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 3a225fd39..9d6fdd08f 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -117,7 +117,7 @@ type inodeFileState struct {
 	// loading is acquired when the inodeFileState begins an asynchronous
 	// load. It releases when the load is complete. Callers that require all
 	// state to be available should call waitForLoad() to ensure that.
-	loading sync.Mutex `state:".(struct{})"`
+	loading sync.CrossGoroutineMutex `state:".(struct{})"`
 
 	// savedUAttr is only allocated during S/R. It points to the save-time
 	// unstable attributes and is used to validate restore-time ones.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 004910453..9b3d8166a 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -18,9 +18,9 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
@@ -28,8 +28,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-var opens = metric.MustCreateNewUint64Metric("/fs/opens", false /* sync */, "Number of file opens.")
-
 // Inode is a file system object that can be simultaneously referenced by different
 // components of the VFS (Dirent, fs.File, etc).
 //
@@ -247,7 +245,7 @@ func (i *Inode) GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File,
 	if i.overlay != nil {
 		return overlayGetFile(ctx, i.overlay, d, flags)
 	}
-	opens.Increment()
+	fsmetric.Opens.Increment()
 	return i.InodeOperations.GetFile(ctx, d, flags)
 }
 
diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go
index f8aad2dbd..b998fb75d 100644
--- a/pkg/sentry/fs/proc/sys.go
+++ b/pkg/sentry/fs/proc/sys.go
@@ -84,6 +84,7 @@ func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode
 
 	children := map[string]*fs.Inode{
 		"hostname": newProcInode(ctx, &h, msrc, fs.SpecialFile, nil),
+		"sem":      newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))),
 		"shmall":   newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10))),
 		"shmmax":   newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10))),
 		"shmmni":   newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10))),
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index aa7199014..b521a86a2 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -15,12 +15,12 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
-        "//pkg/metric",
         "//pkg/safemem",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fsmetric",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index d6c65301c..e04cd608d 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -18,14 +18,13 @@ import (
 	"fmt"
 	"io"
 	"math"
-	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -35,13 +34,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-var (
-	opensRO  = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.")
-	opensW   = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.")
-	reads    = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.")
-	readWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.")
-)
-
 // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file.
 // These files are backed by pages allocated from a platform.Memory, and may be
 // directly mapped.
@@ -157,9 +149,9 @@ func (*fileInodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldPare
 // GetFile implements fs.InodeOperations.GetFile.
 func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
 	if flags.Write {
-		opensW.Increment()
+		fsmetric.TmpfsOpensW.Increment()
 	} else if flags.Read {
-		opensRO.Increment()
+		fsmetric.TmpfsOpensRO.Increment()
 	}
 	flags.Pread = true
 	flags.Pwrite = true
@@ -319,14 +311,12 @@ func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) {
 }
 
 func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	var start time.Time
-	if fs.RecordWaitTime {
-		start = time.Now()
-	}
-	reads.Increment()
+	start := fsmetric.StartReadWait()
+	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
+	fsmetric.TmpfsReads.Increment()
+
 	// Zero length reads for tmpfs are no-ops.
 	if dst.NumBytes() == 0 {
-		fs.IncrementWait(readWait, start)
 		return 0, nil
 	}
 
@@ -343,7 +333,6 @@ func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst userm
 	size := f.attr.Size
 	f.dataMu.RUnlock()
 	if offset >= size {
-		fs.IncrementWait(readWait, start)
 		return 0, io.EOF
 	}
 
@@ -354,7 +343,6 @@ func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst userm
 		f.attr.AccessTime = ktime.NowFromContext(ctx)
 		f.attrMu.Unlock()
 	}
-	fs.IncrementWait(readWait, start)
 	return n, err
 }
 
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index 89c3ef079..1bbe6fdb7 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -363,7 +363,7 @@ func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
 func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
 	var ready waiter.EventMask
 
-	if fd.fs.umounted {
+	if fd.fs == nil || fd.fs.umounted {
 		ready |= waiter.EventErr
 		return ready & mask
 	}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 4c3e9acf8..807b6ed1f 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -59,6 +59,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsmetric",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 2294c490e..df27554d3 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -24,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
@@ -985,14 +986,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 	switch d.fileType() {
 	case linux.S_IFREG:
 		if !d.fs.opts.regularFilesUseSpecialFileFD {
-			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
+			if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil {
 				return nil, err
 			}
-			fd := &regularFileFD{}
-			fd.LockFD.Init(&d.locks)
-			if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
-				AllowDirectIO: true,
-			}); err != nil {
+			fd, err := newRegularFileFD(mnt, d, opts.Flags)
+			if err != nil {
 				return nil, err
 			}
 			vfd = &fd.vfsfd
@@ -1019,6 +1017,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
+		if atomic.LoadInt32(&d.readFD) >= 0 {
+			fsmetric.GoferOpensHost.Increment()
+		} else {
+			fsmetric.GoferOpens9P.Increment()
+		}
 		return &fd.vfsfd, nil
 	case linux.S_IFLNK:
 		// Can't open symlinks without O_PATH (which is unimplemented).
@@ -1110,7 +1113,7 @@ retry:
 			return nil, err
 		}
 	}
-	fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+	fd, err := newSpecialFileFD(h, mnt, d, opts.Flags)
 	if err != nil {
 		h.close(ctx)
 		return nil, err
@@ -1205,11 +1208,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	// Finally, construct a file description representing the created file.
 	var childVFSFD *vfs.FileDescription
 	if useRegularFileFD {
-		fd := &regularFileFD{}
-		fd.LockFD.Init(&child.locks)
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
-			AllowDirectIO: true,
-		}); err != nil {
+		fd, err := newRegularFileFD(mnt, child, opts.Flags)
+		if err != nil {
 			return nil, err
 		}
 		childVFSFD = &fd.vfsfd
@@ -1221,7 +1221,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		if fdobj != nil {
 			h.fd = int32(fdobj.Release())
 		}
-		fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags)
+		fd, err := newSpecialFileFD(h, mnt, child, opts.Flags)
 		if err != nil {
 			h.close(ctx)
 			return nil, err
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 75a836899..3cdb1e659 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -743,7 +743,9 @@ type dentry struct {
 	// for memory mappings. If mmapFD is -1, no such FD is available, and the
 	// internal page cache implementation is used for memory mappings instead.
 	//
-	// These fields are protected by handleMu.
+	// These fields are protected by handleMu. readFD, writeFD, and mmapFD are
+	// additionally written using atomic memory operations, allowing them to be
+	// read (albeit racily) with atomic.LoadInt32() without locking handleMu.
 	//
 	// readFile and writeFile may or may not represent the same p9.File. Once
 	// either p9.File transitions from closed (isNil() == true) to open
@@ -1351,16 +1353,11 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 		return
 	}
 	if refs > 0 {
-		if d.cached {
-			// This isn't strictly necessary (fs.cachedDentries is permitted to
-			// contain dentries with non-zero refs, which are skipped by
-			// fs.evictCachedDentryLocked() upon reaching the end of the LRU),
-			// but since we are already holding fs.renameMu for writing we may
-			// as well.
-			d.fs.cachedDentries.Remove(d)
-			d.fs.cachedDentriesLen--
-			d.cached = false
-		}
+		// This isn't strictly necessary (fs.cachedDentries is permitted to
+		// contain dentries with non-zero refs, which are skipped by
+		// fs.evictCachedDentryLocked() upon reaching the end of the LRU), but
+		// since we are already holding fs.renameMu for writing we may as well.
+		d.removeFromCacheLocked()
 		return
 	}
 	// Deleted and invalidated dentries with zero references are no longer
@@ -1369,20 +1366,18 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 		if d.isDeleted() {
 			d.watches.HandleDeletion(ctx)
 		}
-		if d.cached {
-			d.fs.cachedDentries.Remove(d)
-			d.fs.cachedDentriesLen--
-			d.cached = false
-		}
+		d.removeFromCacheLocked()
 		d.destroyLocked(ctx)
 		return
 	}
-	// If d still has inotify watches and it is not deleted or invalidated, we
-	// cannot cache it and allow it to be evicted. Otherwise, we will lose its
-	// watches, even if a new dentry is created for the same file in the future.
-	// Note that the size of d.watches cannot concurrently transition from zero
-	// to non-zero, because adding a watch requires holding a reference on d.
+	// If d still has inotify watches and it is not deleted or invalidated, it
+	// can't be evicted. Otherwise, we will lose its watches, even if a new
+	// dentry is created for the same file in the future. Note that the size of
+	// d.watches cannot concurrently transition from zero to non-zero, because
+	// adding a watch requires holding a reference on d.
 	if d.watches.Size() > 0 {
+		// As in the refs > 0 case, this is not strictly necessary.
+		d.removeFromCacheLocked()
 		return
 	}
 
@@ -1413,6 +1408,15 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	}
 }
 
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) removeFromCacheLocked() {
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentriesLen--
+		d.cached = false
+	}
+}
+
 // Precondition: fs.renameMu must be locked for writing; it may be temporarily
 // unlocked.
 func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
@@ -1426,12 +1430,10 @@ func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
 // * fs.cachedDentriesLen != 0.
 func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
 	victim := fs.cachedDentries.Back()
-	fs.cachedDentries.Remove(victim)
-	fs.cachedDentriesLen--
-	victim.cached = false
-	// victim.refs may have become non-zero from an earlier path resolution
-	// since it was inserted into fs.cachedDentries.
-	if atomic.LoadInt64(&victim.refs) == 0 {
+	victim.removeFromCacheLocked()
+	// victim.refs or victim.watches.Size() may have become non-zero from an
+	// earlier path resolution since it was inserted into fs.cachedDentries.
+	if atomic.LoadInt64(&victim.refs) == 0 && victim.watches.Size() == 0 {
 		if victim.parent != nil {
 			victim.parent.dirMu.Lock()
 			if !victim.vfsd.IsDead() {
@@ -1668,7 +1670,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 						}
 						fdsToClose = append(fdsToClose, d.readFD)
 						invalidateTranslations = true
-						d.readFD = h.fd
+						atomic.StoreInt32(&d.readFD, h.fd)
 					} else {
 						// Otherwise, we want to avoid invalidating existing
 						// memmap.Translations (which is expensive); instead, use
@@ -1689,15 +1691,15 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 						h.fd = d.readFD
 					}
 				} else {
-					d.readFD = h.fd
+					atomic.StoreInt32(&d.readFD, h.fd)
 				}
 				if d.writeFD != h.fd && d.writeFD >= 0 {
 					fdsToClose = append(fdsToClose, d.writeFD)
 				}
-				d.writeFD = h.fd
-				d.mmapFD = h.fd
+				atomic.StoreInt32(&d.writeFD, h.fd)
+				atomic.StoreInt32(&d.mmapFD, h.fd)
 			} else if openReadable && d.readFD < 0 {
-				d.readFD = h.fd
+				atomic.StoreInt32(&d.readFD, h.fd)
 				// If the file has not been opened for writing, the new FD may
 				// be used for read-only memory mappings. If the file was
 				// previously opened for reading (without an FD), then existing
@@ -1705,10 +1707,10 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 				// invalidate those mappings.
 				if d.writeFile.isNil() {
 					invalidateTranslations = !d.readFile.isNil()
-					d.mmapFD = h.fd
+					atomic.StoreInt32(&d.mmapFD, h.fd)
 				}
 			} else if openWritable && d.writeFD < 0 {
-				d.writeFD = h.fd
+				atomic.StoreInt32(&d.writeFD, h.fd)
 				if d.readFD >= 0 {
 					// We have an existing read-only FD, but the file has just
 					// been opened for writing, so we need to start supporting
@@ -1717,7 +1719,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 					// writable memory mappings. Switch to using the internal
 					// page cache.
 					invalidateTranslations = true
-					d.mmapFD = -1
+					atomic.StoreInt32(&d.mmapFD, -1)
 				}
 			} else {
 				// The new FD is not useful.
@@ -1729,7 +1731,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 			// memory mappings. However, we have no writable host FD. Switch to
 			// using the internal page cache.
 			invalidateTranslations = true
-			d.mmapFD = -1
+			atomic.StoreInt32(&d.mmapFD, -1)
 		}
 
 		// Switch to new fids.
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 652142ecc..283b220bb 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -48,6 +49,25 @@ type regularFileFD struct {
 	off int64
 }
 
+func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) {
+	fd := &regularFileFD{}
+	fd.LockFD.Init(&d.locks)
+	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		AllowDirectIO: true,
+	}); err != nil {
+		return nil, err
+	}
+	if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) {
+		fsmetric.GoferOpensWX.Increment()
+	}
+	if atomic.LoadInt32(&d.mmapFD) >= 0 {
+		fsmetric.GoferOpensHost.Increment()
+	} else {
+		fsmetric.GoferOpens9P.Increment()
+	}
+	return fd, nil
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *regularFileFD) Release(context.Context) {
 }
@@ -89,6 +109,18 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	start := fsmetric.StartReadWait()
+	d := fd.dentry()
+	defer func() {
+		if atomic.LoadInt32(&d.readFD) >= 0 {
+			fsmetric.GoferReadsHost.Increment()
+			fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
+		} else {
+			fsmetric.GoferReads9P.Increment()
+			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
+		}
+	}()
+
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
@@ -102,7 +134,6 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 
 	// Check for reading at EOF before calling into MM (but not under
 	// InteropModeShared, which makes d.size unreliable).
-	d := fd.dentry()
 	if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
 		return 0, io.EOF
 	}
@@ -647,10 +678,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
 			// Whether or not we have a host FD, we're not allowed to use it.
 			return syserror.ENODEV
 		}
-		d.handleMu.RLock()
-		haveFD := d.mmapFD >= 0
-		d.handleMu.RUnlock()
-		if !haveFD {
+		if atomic.LoadInt32(&d.mmapFD) < 0 {
 			return syserror.ENODEV
 		}
 	default:
@@ -668,10 +696,7 @@ func (d *dentry) mayCachePages() bool {
 	if d.fs.opts.forcePageCache {
 		return true
 	}
-	d.handleMu.RLock()
-	haveFD := d.mmapFD >= 0
-	d.handleMu.RUnlock()
-	return haveFD
+	return atomic.LoadInt32(&d.mmapFD) >= 0
 }
 
 // AddMapping implements memmap.Mappable.AddMapping.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 625400c0b..089955a96 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -70,7 +71,7 @@ type specialFileFD struct {
 	buf     []byte
 }
 
-func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) {
 	ftype := d.fileType()
 	seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK
 	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
@@ -80,7 +81,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks,
 		seekable:      seekable,
 		haveQueue:     haveQueue,
 	}
-	fd.LockFD.Init(locks)
+	fd.LockFD.Init(&d.locks)
 	if haveQueue {
 		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
 			return nil, err
@@ -98,6 +99,14 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks,
 	d.fs.syncMu.Lock()
 	d.fs.specialFileFDs[fd] = struct{}{}
 	d.fs.syncMu.Unlock()
+	if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) {
+		fsmetric.GoferOpensWX.Increment()
+	}
+	if h.fd >= 0 {
+		fsmetric.GoferOpensHost.Increment()
+	} else {
+		fsmetric.GoferOpens9P.Increment()
+	}
 	return fd, nil
 }
 
@@ -161,6 +170,17 @@ func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	start := fsmetric.StartReadWait()
+	defer func() {
+		if fd.handle.fd >= 0 {
+			fsmetric.GoferReadsHost.Increment()
+			fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
+		} else {
+			fsmetric.GoferReads9P.Increment()
+			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
+		}
+	}()
+
 	if fd.seekable && offset < 0 {
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index c14abcff4..565d723f0 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -286,7 +286,7 @@ func (d *Dentry) cacheLocked(ctx context.Context) {
 	refs := atomic.LoadInt64(&d.refs)
 	if refs == -1 {
 		// Dentry has already been destroyed.
-		panic(fmt.Sprintf("cacheLocked called on a dentry which has already been destroyed: %v", d))
+		return
 	}
 	if refs > 0 {
 		if d.cached {
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 469f3a33d..27b00cf6f 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -16,7 +16,6 @@ package overlay
 
 import (
 	"fmt"
-	"io"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -129,25 +128,9 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 			return err
 		}
 		defer newFD.DecRef(ctx)
-		bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
-		for {
-			readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{})
-			if readErr != nil && readErr != io.EOF {
-				cleanupUndoCopyUp()
-				return readErr
-			}
-			total := int64(0)
-			for total < readN {
-				writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{})
-				total += writeN
-				if writeErr != nil {
-					cleanupUndoCopyUp()
-					return writeErr
-				}
-			}
-			if readErr == io.EOF {
-				break
-			}
+		if _, err := vfs.CopyRegularFileData(ctx, newFD, oldFD); err != nil {
+			cleanupUndoCopyUp()
+			return err
 		}
 		d.mapsMu.Lock()
 		defer d.mapsMu.Unlock()
diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go
index 2b89a7a6d..25c785fd4 100644
--- a/pkg/sentry/fsimpl/overlay/regular_file.go
+++ b/pkg/sentry/fsimpl/overlay/regular_file.go
@@ -103,8 +103,8 @@ func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescript
 			for e, mask := range fd.lowerWaiters {
 				fd.cachedFD.EventUnregister(e)
 				upperFD.EventRegister(e, mask)
-				if ready&mask != 0 {
-					e.Callback.Callback(e)
+				if m := ready & mask; m != 0 {
+					e.Callback.Callback(e, m)
 				}
 			}
 		}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 5cf8a071a..d4f6a5a9b 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -208,7 +208,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	for _, se := range n.kernel.ListSockets() {
 		s := se.SockVFS2
 		if !s.TryIncRef() {
-			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
+			// Racing with socket destruction, this is ok.
 			continue
 		}
 		if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
@@ -351,7 +351,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 	for _, se := range k.ListSockets() {
 		s := se.SockVFS2
 		if !s.TryIncRef() {
-			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
+			// Racing with socket destruction, this is ok.
 			continue
 		}
 		sops, ok := s.Impl().(socket.SocketVFS2)
@@ -516,7 +516,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	for _, se := range d.kernel.ListSockets() {
 		s := se.SockVFS2
 		if !s.TryIncRef() {
-			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
+			// Racing with socket destruction, this is ok.
 			continue
 		}
 		sops, ok := s.Impl().(socket.SocketVFS2)
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 7c7afdcfa..25c407d98 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -44,6 +44,7 @@ func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *
 	return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
 		"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
 			"hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
+			"sem":      fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))),
 			"shmall":   fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)),
 			"shmmax":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)),
 			"shmmni":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)),
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index fe520b6fd..09957c2b7 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -67,6 +67,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsmetric",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e39cd305b..9296db2fb 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -381,6 +382,8 @@ afterTrailingSymlink:
 		creds := rp.Credentials()
 		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
 		parentDir.insertChildLocked(child, name)
+		child.IncRef()
+		defer child.DecRef(ctx)
 		unlock()
 		fd, err := child.open(ctx, rp, &opts, true)
 		if err != nil {
@@ -437,6 +440,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 				return nil, err
 			}
 		}
+		if fd.vfsfd.IsWritable() {
+			fsmetric.TmpfsOpensW.Increment()
+		} else if fd.vfsfd.IsReadable() {
+			fsmetric.TmpfsOpensRO.Increment()
+		}
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably.
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index f8e0cffb0..6255a7c84 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -359,6 +360,10 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	start := fsmetric.StartReadWait()
+	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
+	fsmetric.TmpfsReads.Increment()
+
 	if offset < 0 {
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index add5dd48e..04e7110a3 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -107,8 +107,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 // Dentries which may have a reference count of zero, and which therefore
 // should be dropped once traversal is complete, are appended to ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -158,15 +160,19 @@ afterSymlink:
 	return child, nil
 }
 
-// verifyChild verifies the hash of child against the already verified hash of
-// the parent to ensure the child is expected.  verifyChild triggers a sentry
-// panic if unexpected modifications to the file system are detected. In
+// verifyChildLocked verifies the hash of child against the already verified
+// hash of the parent to ensure the child is expected.  verifyChild triggers a
+// sentry panic if unexpected modifications to the file system are detected. In
 // noCrashOnVerificationFailure mode it returns a syserror instead.
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+//
 // TODO(b/166474175): Investigate all possible errors returned in this
 // function, and make sure we differentiate all errors that indicate unexpected
 // modifications to the file system from the ones that are not harmful.
-func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
 	vfsObj := fs.vfsfs.VirtualFilesystem()
 
 	// Get the path to the child dentry. This is only used to provide path
@@ -248,7 +254,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
 		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
 	}
 
-	fdReader := vfs.FileReadWriteSeeker{
+	fdReader := FileReadWriteSeeker{
 		FD:  parentMerkleFD,
 		Ctx: ctx,
 	}
@@ -268,7 +274,8 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
 	// contain the hash of the children in the parent Merkle tree when
 	// Verify returns with success.
 	var buf bytes.Buffer
-	if _, err := merkletree.Verify(&merkletree.VerifyParams{
+	parent.hashMu.RLock()
+	_, err = merkletree.Verify(&merkletree.VerifyParams{
 		Out:      &buf,
 		File:     &fdReader,
 		Tree:     &fdReader,
@@ -284,21 +291,27 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
 		ReadSize:              int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())),
 		Expected:              parent.hash,
 		DataAndTreeInSameFile: true,
-	}); err != nil && err != io.EOF {
+	})
+	parent.hashMu.RUnlock()
+	if err != nil && err != io.EOF {
 		return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
 	}
 
 	// Cache child hash when it's verified the first time.
+	child.hashMu.Lock()
 	if len(child.hash) == 0 {
 		child.hash = buf.Bytes()
 	}
+	child.hashMu.Unlock()
 	return child, nil
 }
 
-// verifyStatAndChildren verifies the stat and children names against the
+// verifyStatAndChildrenLocked verifies the stat and children names against the
 // verified hash. The mode/uid/gid and childrenNames of the file is cached
 // after verified.
-func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat linux.Statx) error {
+//
+// Preconditions: d.dirMu must be locked.
+func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error {
 	vfsObj := fs.vfsfs.VirtualFilesystem()
 
 	// Get the path to the child dentry. This is only used to provide path
@@ -384,12 +397,13 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat
 		}
 	}
 
-	fdReader := vfs.FileReadWriteSeeker{
+	fdReader := FileReadWriteSeeker{
 		FD:  fd,
 		Ctx: ctx,
 	}
 
 	var buf bytes.Buffer
+	d.hashMu.RLock()
 	params := &merkletree.VerifyParams{
 		Out:      &buf,
 		Tree:     &fdReader,
@@ -407,6 +421,7 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat
 		Expected:              d.hash,
 		DataAndTreeInSameFile: false,
 	}
+	d.hashMu.RUnlock()
 	if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
 		params.DataAndTreeInSameFile = true
 	}
@@ -421,7 +436,9 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat
 	return nil
 }
 
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
 	if child, ok := parent.children[name]; ok {
 		// If verity is enabled on child, we should check again whether
@@ -470,7 +487,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
 		// be cached before enabled.
 		if fs.allowRuntimeEnable {
 			if parent.verityEnabled() {
-				if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+				if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil {
 					return nil, err
 				}
 			}
@@ -486,7 +503,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
 				if err != nil {
 					return nil, err
 				}
-				if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil {
+				if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil {
 					return nil, err
 				}
 			}
@@ -506,7 +523,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
 	vfsObj := fs.vfsfs.VirtualFilesystem()
 
@@ -597,13 +616,13 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
 	// allowRuntimeEnable mode and the parent directory hasn't been enabled
 	// yet.
 	if parent.verityEnabled() {
-		if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+		if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil {
 			child.destroyLocked(ctx)
 			return nil, err
 		}
 	}
 	if child.verityEnabled() {
-		if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil {
+		if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil {
 			child.destroyLocked(ctx)
 			return nil, err
 		}
@@ -617,7 +636,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -958,11 +979,13 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	d.dirMu.Lock()
 	if d.verityEnabled() {
-		if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil {
+		if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
 			return linux.Statx{}, err
 		}
 	}
+	d.dirMu.Unlock()
 	return stat, nil
 }
 
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 87dabe038..5788c661f 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -19,6 +19,18 @@
 // The verity file system is read-only, except for one case: when
 // allowRuntimeEnable is true, additional Merkle files can be generated using
 // the FS_IOC_ENABLE_VERITY ioctl.
+//
+// Lock order:
+//
+// filesystem.renameMu
+//   dentry.dirMu
+//     fileDescription.mu
+//       filesystem.verityMu
+//         dentry.hashMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
 package verity
 
 import (
@@ -79,9 +91,6 @@ var (
 	// noCrashOnVerificationFailure indicates whether the sandbox should panic
 	// whenever verification fails. If true, an error is returned instead of
 	// panicking. This should only be set for tests.
-	//
-	// TODO(b/165661693): Decide whether to panic or return error based on this
-	// flag.
 	noCrashOnVerificationFailure bool
 
 	// verityMu synchronizes concurrent operations that enable verity and perform
@@ -372,12 +381,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 			return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err))
 		}
 
-		if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil {
+		if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
 			return nil, nil, err
 		}
 	}
 
+	d.hashMu.Lock()
 	copy(d.hash, iopts.RootHash)
+	d.hashMu.Unlock()
 	d.vfsd.Init(d)
 
 	fs.rootDentry = d
@@ -402,7 +413,8 @@ type dentry struct {
 	fs *filesystem
 
 	// mode, uid, gid and size are the file mode, owner, group, and size of
-	// the file in the underlying file system.
+	// the file in the underlying file system. They are set when a dentry
+	// is initialized, and never modified.
 	mode uint32
 	uid  uint32
 	gid  uint32
@@ -425,18 +437,22 @@ type dentry struct {
 
 	// childrenNames stores the name of all children of the dentry. This is
 	// used by verity to check whether a child is expected. This is only
-	// populated by enableVerity.
+	// populated by enableVerity. childrenNames is also protected by dirMu.
 	childrenNames map[string]struct{}
 
-	// lowerVD is the VirtualDentry in the underlying file system.
+	// lowerVD is the VirtualDentry in the underlying file system. It is
+	// never modified after initialized.
 	lowerVD vfs.VirtualDentry
 
 	// lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
-	// in the underlying file system.
+	// in the underlying file system. It is never modified after
+	// initialized.
 	lowerMerkleVD vfs.VirtualDentry
 
-	// hash is the calculated hash for the current file or directory.
-	hash []byte
+	// hash is the calculated hash for the current file or directory. hash
+	// is protected by hashMu.
+	hashMu sync.RWMutex `state:"nosave"`
+	hash   []byte
 }
 
 // newDentry creates a new dentry representing the given verity file. The
@@ -519,7 +535,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 
 // destroyLocked destroys the dentry.
 //
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
 func (d *dentry) destroyLocked(ctx context.Context) {
 	switch atomic.LoadInt64(&d.refs) {
 	case 0:
@@ -599,6 +617,8 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 // mode, it returns true if the target has been enabled with
 // ioctl(FS_IOC_ENABLE_VERITY).
 func (d *dentry) verityEnabled() bool {
+	d.hashMu.RLock()
+	defer d.hashMu.RUnlock()
 	return !d.fs.allowRuntimeEnable || len(d.hash) != 0
 }
 
@@ -678,11 +698,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	fd.d.dirMu.Lock()
 	if fd.d.verityEnabled() {
-		if err := fd.d.fs.verifyStatAndChildren(ctx, fd.d, stat); err != nil {
+		if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil {
 			return linux.Statx{}, err
 		}
 	}
+	fd.d.dirMu.Unlock()
 	return stat, nil
 }
 
@@ -718,22 +740,24 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32)
 	return offset, nil
 }
 
-// generateMerkle generates a Merkle tree file for fd. If fd points to a file
-// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
-// of the generated Merkle tree and the data size is returned.  If fd points to
-// a regular file, the data is the content of the file. If fd points to a
-// directory, the data is all hahes of its children, written to the Merkle tree
-// file.
-func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
-	fdReader := vfs.FileReadWriteSeeker{
+// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a
+// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The
+// hash of the generated Merkle tree and the data size is returned.  If fd
+// points to a regular file, the data is the content of the file. If fd points
+// to a directory, the data is all hahes of its children, written to the Merkle
+// tree file.
+//
+// Preconditions: fd.d.fs.verityMu must be locked.
+func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) {
+	fdReader := FileReadWriteSeeker{
 		FD:  fd.lowerFD,
 		Ctx: ctx,
 	}
-	merkleReader := vfs.FileReadWriteSeeker{
+	merkleReader := FileReadWriteSeeker{
 		FD:  fd.merkleReader,
 		Ctx: ctx,
 	}
-	merkleWriter := vfs.FileReadWriteSeeker{
+	merkleWriter := FileReadWriteSeeker{
 		FD:  fd.merkleWriter,
 		Ctx: ctx,
 	}
@@ -793,11 +817,14 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64,
 	return hash, uint64(params.Size), err
 }
 
-// recordChildren writes the names of fd's children into the corresponding
-// Merkle tree file, and saves the offset/size of the map into xattrs.
+// recordChildrenLocked writes the names of fd's children into the
+// corresponding Merkle tree file, and saves the offset/size of the map into
+// xattrs.
 //
-// Preconditions: fd.d.isDir() == true
-func (fd *fileDescription) recordChildren(ctx context.Context) error {
+// Preconditions:
+// * fd.d.fs.verityMu must be locked.
+// * fd.d.isDir() == true.
+func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error {
 	// Record the children names in the Merkle tree file.
 	childrenNames, err := json.Marshal(fd.d.childrenNames)
 	if err != nil {
@@ -847,7 +874,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
 		return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
 	}
 
-	hash, dataSize, err := fd.generateMerkle(ctx)
+	hash, dataSize, err := fd.generateMerkleLocked(ctx)
 	if err != nil {
 		return 0, err
 	}
@@ -888,11 +915,13 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
 	}
 
 	if fd.d.isDir() {
-		if err := fd.recordChildren(ctx); err != nil {
+		if err := fd.recordChildrenLocked(ctx); err != nil {
 			return 0, err
 		}
 	}
-	fd.d.hash = append(fd.d.hash, hash...)
+	fd.d.hashMu.Lock()
+	fd.d.hash = hash
+	fd.d.hashMu.Unlock()
 	return 0, nil
 }
 
@@ -904,6 +933,9 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm
 	}
 	var metadata linux.DigestMetadata
 
+	fd.d.hashMu.RLock()
+	defer fd.d.hashMu.RUnlock()
+
 	// If allowRuntimeEnable is true, an empty fd.d.hash indicates that
 	// verity is not enabled for the file. If allowRuntimeEnable is false,
 	// this is an integrity violation because all files should have verity
@@ -940,11 +972,13 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm
 func (fd *fileDescription) verityFlags(ctx context.Context, flags usermem.Addr) (uintptr, error) {
 	f := int32(0)
 
+	fd.d.hashMu.RLock()
 	// All enabled files should store a hash. This flag is not settable via
 	// FS_IOC_SETFLAGS.
 	if len(fd.d.hash) != 0 {
 		f |= linux.FS_VERITY_FL
 	}
+	fd.d.hashMu.RUnlock()
 
 	t := kernel.TaskFromContext(ctx)
 	if t == nil {
@@ -1013,16 +1047,17 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
 		return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
 	}
 
-	dataReader := vfs.FileReadWriteSeeker{
+	dataReader := FileReadWriteSeeker{
 		FD:  fd.lowerFD,
 		Ctx: ctx,
 	}
 
-	merkleReader := vfs.FileReadWriteSeeker{
+	merkleReader := FileReadWriteSeeker{
 		FD:  fd.merkleReader,
 		Ctx: ctx,
 	}
 
+	fd.d.hashMu.RLock()
 	n, err := merkletree.Verify(&merkletree.VerifyParams{
 		Out:      dst.Writer(ctx),
 		File:     &dataReader,
@@ -1040,6 +1075,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
 		Expected:              fd.d.hash,
 		DataAndTreeInSameFile: false,
 	})
+	fd.d.hashMu.RUnlock()
 	if err != nil {
 		return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
 	}
@@ -1065,3 +1101,45 @@ func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t
 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
 	return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence)
 }
+
+// FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as
+// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
+type FileReadWriteSeeker struct {
+	FD    *vfs.FileDescription
+	Ctx   context.Context
+	ROpts vfs.ReadOptions
+	WOpts vfs.WriteOptions
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
+	return int(n), err
+}
+
+// Read implements io.ReadWriteSeeker.Read.
+func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
+	return int(n), err
+}
+
+// Seek implements io.ReadWriteSeeker.Seek.
+func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+	return f.FD.Seek(f.Ctx, offset, int32(whence))
+}
+
+// WriteAt implements io.WriterAt.WriteAt.
+func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts)
+	return int(n), err
+}
+
+// Write implements io.ReadWriteSeeker.Write.
+func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
+	buf := usermem.BytesIOSequence(p)
+	n, err := f.FD.Write(f.Ctx, buf, f.WOpts)
+	return int(n), err
+}
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
index 7196e74eb..6ced0afc9 100644
--- a/pkg/sentry/fsimpl/verity/verity_test.go
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -35,16 +35,39 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// rootMerkleFilename is the name of the root Merkle tree file.
-const rootMerkleFilename = "root.verity"
+const (
+	// rootMerkleFilename is the name of the root Merkle tree file.
+	rootMerkleFilename = "root.verity"
+	// maxDataSize is the maximum data size of a test file.
+	maxDataSize = 100000
+)
+
+var hashAlgs = []HashAlgorithm{SHA256, SHA512}
 
-// maxDataSize is the maximum data size written to the file for test.
-const maxDataSize = 100000
+func dentryFromVD(t *testing.T, vd vfs.VirtualDentry) *dentry {
+	t.Helper()
+	d, ok := vd.Dentry().Impl().(*dentry)
+	if !ok {
+		t.Fatalf("can't assert %T as a *dentry", vd)
+	}
+	return d
+}
+
+// dentryFromFD returns the dentry corresponding to fd.
+func dentryFromFD(t *testing.T, fd *vfs.FileDescription) *dentry {
+	t.Helper()
+	f, ok := fd.Impl().(*fileDescription)
+	if !ok {
+		t.Fatalf("can't assert %T as a *fileDescription", fd)
+	}
+	return f.d
+}
 
 // newVerityRoot creates a new verity mount, and returns the root. The
 // underlying file system is tmpfs. If the error is not nil, then cleanup
 // should be called when the root is no longer needed.
 func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) {
+	t.Helper()
 	k, err := testutil.Boot()
 	if err != nil {
 		t.Fatalf("testutil.Boot: %v", err)
@@ -92,7 +115,6 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem,
 		t.Fatalf("testutil.CreateTask: %v", err)
 	}
 
-	t.Helper()
 	t.Cleanup(func() {
 		root.DecRef(ctx)
 		mntns.DecRef(ctx)
@@ -100,21 +122,97 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem,
 	return vfsObj, root, task, nil
 }
 
-// newFileFD creates a new file in the verity mount, and returns the FD. The FD
-// points to a file that has random data generated.
-func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
-	creds := auth.CredentialsFromContext(ctx)
-	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+// openVerityAt opens a verity file.
+//
+// TODO(chongc): release reference from opening the file when done.
+func openVerityAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, path string, flags uint32, mode linux.FileMode) (*vfs.FileDescription, error) {
+	return vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  vd,
+		Start: vd,
+		Path:  fspath.Parse(path),
+	}, &vfs.OpenOptions{
+		Flags: flags,
+		Mode:  mode,
+	})
+}
 
-	// Create the file in the underlying file system.
-	lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-		Root:  lowerRoot,
-		Start: lowerRoot,
-		Path:  fspath.Parse(filePath),
+// openLowerAt opens the file in the underlying file system.
+//
+// TODO(chongc): release reference from opening the file when done.
+func (d *dentry) openLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, path string, flags uint32, mode linux.FileMode) (*vfs.FileDescription, error) {
+	return vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(path),
 	}, &vfs.OpenOptions{
-		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-		Mode:  linux.ModeRegular | mode,
+		Flags: flags,
+		Mode:  mode,
 	})
+}
+
+// openLowerMerkleAt opens the Merkle file in the underlying file system.
+//
+// TODO(chongc): release reference from opening the file when done.
+func (d *dentry) openLowerMerkleAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, flags uint32, mode linux.FileMode) (*vfs.FileDescription, error) {
+	return vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  d.lowerMerkleVD,
+		Start: d.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: flags,
+		Mode:  mode,
+	})
+}
+
+// unlinkLowerAt deletes the file in the underlying file system.
+func (d *dentry) unlinkLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, path string) error {
+	return vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(path),
+	})
+}
+
+// unlinkLowerMerkleAt deletes the Merkle file in the underlying file system.
+func (d *dentry) unlinkLowerMerkleAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, path string) error {
+	return vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(merklePrefix + path),
+	})
+}
+
+// renameLowerAt renames file name to newName in the underlying file system.
+func (d *dentry) renameLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, name string, newName string) error {
+	return vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(name),
+	}, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(newName),
+	}, &vfs.RenameOptions{})
+}
+
+// renameLowerMerkleAt renames Merkle file name to newName in the underlying
+// file system.
+func (d *dentry) renameLowerMerkleAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, name string, newName string) error {
+	return vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(merklePrefix + name),
+	}, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(merklePrefix + newName),
+	}, &vfs.RenameOptions{})
+}
+
+// newFileFD creates a new file in the verity mount, and returns the FD. The FD
+// points to a file that has random data generated.
+func newFileFD(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
+	// Create the file in the underlying file system.
+	lowerFD, err := dentryFromVD(t, root).openLowerAt(ctx, vfsObj, filePath, linux.O_RDWR|linux.O_CREAT|linux.O_EXCL, linux.ModeRegular|mode)
 	if err != nil {
 		return nil, 0, err
 	}
@@ -137,20 +235,12 @@ func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.Virt
 	lowerFD.DecRef(ctx)
 
 	// Now open the verity file descriptor.
-	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(filePath),
-	}, &vfs.OpenOptions{
-		Flags: linux.O_RDONLY,
-		Mode:  linux.ModeRegular | mode,
-	})
+	fd, err := openVerityAt(ctx, vfsObj, root, filePath, linux.O_RDONLY, mode)
 	return fd, dataSize, err
 }
 
-// corruptRandomBit randomly flips a bit in the file represented by fd.
-func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
-	// Flip a random bit in the underlying file.
+// flipRandomBit randomly flips a bit in the file represented by fd.
+func flipRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
 	randomPos := int64(rand.Intn(size))
 	byteToModify := make([]byte, 1)
 	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil {
@@ -163,7 +253,14 @@ func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) er
 	return nil
 }
 
-var hashAlgs = []HashAlgorithm{SHA256, SHA512}
+func enableVerity(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
+	t.Helper()
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("enable verity: %v", err)
+	}
+}
 
 // TestOpen ensures that when a file is created, the corresponding Merkle tree
 // file and the root Merkle tree file exist.
@@ -175,30 +272,18 @@ func TestOpen(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+		fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
+		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Ensure that the corresponding Merkle tree file is created.
-		lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
-		if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  lowerRoot,
-			Start: lowerRoot,
-			Path:  fspath.Parse(merklePrefix + filename),
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDONLY,
-		}); err != nil {
+		if _, err = dentryFromFD(t, fd).openLowerMerkleAt(ctx, vfsObj, linux.O_RDONLY, linux.ModeRegular); err != nil {
 			t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
 		}
 
 		// Ensure the root merkle tree file is created.
-		if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  lowerRoot,
-			Start: lowerRoot,
-			Path:  fspath.Parse(merklePrefix + rootMerkleFilename),
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDONLY,
-		}); err != nil {
+		if _, err = dentryFromVD(t, root).openLowerMerkleAt(ctx, vfsObj, linux.O_RDONLY, linux.ModeRegular); err != nil {
 			t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
 		}
 	}
@@ -214,17 +299,13 @@ func TestPReadUnmodifiedFileSucceeds(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file and confirm a normal read succeeds.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		buf := make([]byte, size)
 		n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
@@ -248,17 +329,13 @@ func TestReadUnmodifiedFileSucceeds(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file and confirm a normal read succeeds.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		buf := make([]byte, size)
 		n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
@@ -282,27 +359,16 @@ func TestReopenUnmodifiedFileSucceeds(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file and confirms a normal read succeeds.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		// Ensure reopening the verity enabled file succeeds.
-		if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  root,
-			Start: root,
-			Path:  fspath.Parse(filename),
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDONLY,
-			Mode:  linux.ModeRegular,
-		}); err != nil {
+		if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != nil {
 			t.Errorf("reopen enabled file failed: %v", err)
 		}
 	}
@@ -317,43 +383,24 @@ func TestOpenNonexistentFile(t *testing.T) {
 	}
 
 	filename := "verity-test-file"
-	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 	if err != nil {
 		t.Fatalf("newFileFD: %v", err)
 	}
 
 	// Enable verity on the file and confirms a normal read succeeds.
-	var args arch.SyscallArguments
-	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-		t.Fatalf("Ioctl: %v", err)
-	}
+	enableVerity(ctx, t, fd)
 
 	// Enable verity on the parent directory.
-	parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-	}, &vfs.OpenOptions{
-		Flags: linux.O_RDONLY,
-	})
+	parentFD, err := openVerityAt(ctx, vfsObj, root, "", linux.O_RDONLY, linux.ModeRegular)
 	if err != nil {
 		t.Fatalf("OpenAt: %v", err)
 	}
-
-	if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
-		t.Fatalf("Ioctl: %v", err)
-	}
+	enableVerity(ctx, t, parentFD)
 
 	// Ensure open an unexpected file in the parent directory fails with
 	// ENOENT rather than verification failure.
-	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(filename + "abc"),
-	}, &vfs.OpenOptions{
-		Flags: linux.O_RDONLY,
-		Mode:  linux.ModeRegular,
-	}); err != syserror.ENOENT {
+	if _, err = openVerityAt(ctx, vfsObj, root, filename+"abc", linux.O_RDONLY, linux.ModeRegular); err != syserror.ENOENT {
 		t.Errorf("OpenAt unexpected error: %v", err)
 	}
 }
@@ -368,33 +415,22 @@ func TestPReadModifiedFileFails(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		// Open a new lowerFD that's read/writable.
-		lowerVD := fd.Impl().(*fileDescription).d.lowerVD
-
-		lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  lowerVD,
-			Start: lowerVD,
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDWR,
-		})
+		lowerFD, err := dentryFromFD(t, fd).openLowerAt(ctx, vfsObj, "", linux.O_RDWR, linux.ModeRegular)
 		if err != nil {
 			t.Fatalf("OpenAt: %v", err)
 		}
 
-		if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
-			t.Fatalf("corruptRandomBit: %v", err)
+		if err := flipRandomBit(ctx, lowerFD, size); err != nil {
+			t.Fatalf("flipRandomBit: %v", err)
 		}
 
 		// Confirm that read from the modified file fails.
@@ -415,33 +451,22 @@ func TestReadModifiedFileFails(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		// Open a new lowerFD that's read/writable.
-		lowerVD := fd.Impl().(*fileDescription).d.lowerVD
-
-		lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  lowerVD,
-			Start: lowerVD,
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDWR,
-		})
+		lowerFD, err := dentryFromFD(t, fd).openLowerAt(ctx, vfsObj, "", linux.O_RDWR, linux.ModeRegular)
 		if err != nil {
 			t.Fatalf("OpenAt: %v", err)
 		}
 
-		if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
-			t.Fatalf("corruptRandomBit: %v", err)
+		if err := flipRandomBit(ctx, lowerFD, size); err != nil {
+			t.Fatalf("flipRandomBit: %v", err)
 		}
 
 		// Confirm that read from the modified file fails.
@@ -462,27 +487,16 @@ func TestModifiedMerkleFails(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		// Open a new lowerMerkleFD that's read/writable.
-		lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
-
-		lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  lowerMerkleVD,
-			Start: lowerMerkleVD,
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDWR,
-		})
+		lowerMerkleFD, err := dentryFromFD(t, fd).openLowerMerkleAt(ctx, vfsObj, linux.O_RDWR, linux.ModeRegular)
 		if err != nil {
 			t.Fatalf("OpenAt: %v", err)
 		}
@@ -493,14 +507,13 @@ func TestModifiedMerkleFails(t *testing.T) {
 			t.Errorf("lowerMerkleFD.Stat: %v", err)
 		}
 
-		if err := corruptRandomBit(ctx, lowerMerkleFD, int(stat.Size)); err != nil {
-			t.Fatalf("corruptRandomBit: %v", err)
+		if err := flipRandomBit(ctx, lowerMerkleFD, int(stat.Size)); err != nil {
+			t.Fatalf("flipRandomBit: %v", err)
 		}
 
 		// Confirm that read from a file with modified Merkle tree fails.
 		buf := make([]byte, size)
 		if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
-			fmt.Println(buf)
 			t.Fatalf("fd.PRead succeeded with modified Merkle file")
 		}
 	}
@@ -517,42 +530,23 @@ func TestModifiedParentMerkleFails(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		// Enable verity on the parent directory.
-		parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  root,
-			Start: root,
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDONLY,
-		})
+		parentFD, err := openVerityAt(ctx, vfsObj, root, "", linux.O_RDONLY, linux.ModeRegular)
 		if err != nil {
 			t.Fatalf("OpenAt: %v", err)
 		}
-
-		if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, parentFD)
 
 		// Open a new lowerMerkleFD that's read/writable.
-		parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
-
-		parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  parentLowerMerkleVD,
-			Start: parentLowerMerkleVD,
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDWR,
-		})
+		parentLowerMerkleFD, err := dentryFromFD(t, fd).parent.openLowerMerkleAt(ctx, vfsObj, linux.O_RDWR, linux.ModeRegular)
 		if err != nil {
 			t.Fatalf("OpenAt: %v", err)
 		}
@@ -572,21 +566,14 @@ func TestModifiedParentMerkleFails(t *testing.T) {
 		if err != nil {
 			t.Fatalf("Failed convert size to int: %v", err)
 		}
-		if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
-			t.Fatalf("corruptRandomBit: %v", err)
+		if err := flipRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
+			t.Fatalf("flipRandomBit: %v", err)
 		}
 
 		parentLowerMerkleFD.DecRef(ctx)
 
 		// Ensure reopening the verity enabled file fails.
-		if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-			Root:  root,
-			Start: root,
-			Path:  fspath.Parse(filename),
-		}, &vfs.OpenOptions{
-			Flags: linux.O_RDONLY,
-			Mode:  linux.ModeRegular,
-		}); err == nil {
+		if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err == nil {
 			t.Errorf("OpenAt file with modified parent Merkle succeeded")
 		}
 	}
@@ -602,18 +589,13 @@ func TestUnmodifiedStatSucceeds(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
-		// Enable verity on the file and confirms stat succeeds.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("fd.Ioctl: %v", err)
-		}
-
+		// Enable verity on the file and confirm that stat succeeds.
+		enableVerity(ctx, t, fd)
 		if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
 			t.Errorf("fd.Stat: %v", err)
 		}
@@ -630,17 +612,13 @@ func TestModifiedStatFails(t *testing.T) {
 		}
 
 		filename := "verity-test-file"
-		fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+		fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 		if err != nil {
 			t.Fatalf("newFileFD: %v", err)
 		}
 
 		// Enable verity on the file.
-		var args arch.SyscallArguments
-		args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-		if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-			t.Fatalf("fd.Ioctl: %v", err)
-		}
+		enableVerity(ctx, t, fd)
 
 		lowerFD := fd.Impl().(*fileDescription).lowerFD
 		// Change the stat of the underlying file, and check that stat fails.
@@ -663,73 +641,57 @@ func TestModifiedStatFails(t *testing.T) {
 // and/or the corresponding Merkle tree file fails with the verity error.
 func TestOpenDeletedFileFails(t *testing.T) {
 	testCases := []struct {
+		name string
 		// The original file is removed if changeFile is true.
 		changeFile bool
 		// The Merkle tree file is removed if changeMerkleFile is true.
 		changeMerkleFile bool
 	}{
 		{
+			name:             "FileOnly",
 			changeFile:       true,
 			changeMerkleFile: false,
 		},
 		{
+			name:             "MerkleOnly",
 			changeFile:       false,
 			changeMerkleFile: true,
 		},
 		{
+			name:             "FileAndMerkle",
 			changeFile:       true,
 			changeMerkleFile: true,
 		},
 	}
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("changeFile:%t, changeMerkleFile:%t", tc.changeFile, tc.changeMerkleFile), func(t *testing.T) {
+		t.Run(tc.name, func(t *testing.T) {
 			vfsObj, root, ctx, err := newVerityRoot(t, SHA256)
 			if err != nil {
 				t.Fatalf("newVerityRoot: %v", err)
 			}
 
 			filename := "verity-test-file"
-			fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+			fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 			if err != nil {
 				t.Fatalf("newFileFD: %v", err)
 			}
 
 			// Enable verity on the file.
-			var args arch.SyscallArguments
-			args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-			if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-				t.Fatalf("Ioctl: %v", err)
-			}
+			enableVerity(ctx, t, fd)
 
-			rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
 			if tc.changeFile {
-				if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-					Root:  rootLowerVD,
-					Start: rootLowerVD,
-					Path:  fspath.Parse(filename),
-				}); err != nil {
+				if err := dentryFromVD(t, root).unlinkLowerAt(ctx, vfsObj, filename); err != nil {
 					t.Fatalf("UnlinkAt: %v", err)
 				}
 			}
 			if tc.changeMerkleFile {
-				if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-					Root:  rootLowerVD,
-					Start: rootLowerVD,
-					Path:  fspath.Parse(merklePrefix + filename),
-				}); err != nil {
+				if err := dentryFromVD(t, root).unlinkLowerMerkleAt(ctx, vfsObj, filename); err != nil {
 					t.Fatalf("UnlinkAt: %v", err)
 				}
 			}
 
 			// Ensure reopening the verity enabled file fails.
-			if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-				Root:  root,
-				Start: root,
-				Path:  fspath.Parse(filename),
-			}, &vfs.OpenOptions{
-				Flags: linux.O_RDONLY,
-				Mode:  linux.ModeRegular,
-			}); err != syserror.EIO {
+			if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO {
 				t.Errorf("got OpenAt error: %v, expected EIO", err)
 			}
 		})
@@ -740,82 +702,58 @@ func TestOpenDeletedFileFails(t *testing.T) {
 // and/or the corresponding Merkle tree file fails with the verity error.
 func TestOpenRenamedFileFails(t *testing.T) {
 	testCases := []struct {
+		name string
 		// The original file is renamed if changeFile is true.
 		changeFile bool
 		// The Merkle tree file is renamed if changeMerkleFile is true.
 		changeMerkleFile bool
 	}{
 		{
+			name:             "FileOnly",
 			changeFile:       true,
 			changeMerkleFile: false,
 		},
 		{
+			name:             "MerkleOnly",
 			changeFile:       false,
 			changeMerkleFile: true,
 		},
 		{
+			name:             "FileAndMerkle",
 			changeFile:       true,
 			changeMerkleFile: true,
 		},
 	}
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("changeFile:%t, changeMerkleFile:%t", tc.changeFile, tc.changeMerkleFile), func(t *testing.T) {
+		t.Run(tc.name, func(t *testing.T) {
 			vfsObj, root, ctx, err := newVerityRoot(t, SHA256)
 			if err != nil {
 				t.Fatalf("newVerityRoot: %v", err)
 			}
 
 			filename := "verity-test-file"
-			fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+			fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644)
 			if err != nil {
 				t.Fatalf("newFileFD: %v", err)
 			}
 
 			// Enable verity on the file.
-			var args arch.SyscallArguments
-			args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
-			if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
-				t.Fatalf("Ioctl: %v", err)
-			}
+			enableVerity(ctx, t, fd)
 
-			rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
 			newFilename := "renamed-test-file"
 			if tc.changeFile {
-				if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-					Root:  rootLowerVD,
-					Start: rootLowerVD,
-					Path:  fspath.Parse(filename),
-				}, &vfs.PathOperation{
-					Root:  rootLowerVD,
-					Start: rootLowerVD,
-					Path:  fspath.Parse(newFilename),
-				}, &vfs.RenameOptions{}); err != nil {
+				if err := dentryFromVD(t, root).renameLowerAt(ctx, vfsObj, filename, newFilename); err != nil {
 					t.Fatalf("RenameAt: %v", err)
 				}
 			}
 			if tc.changeMerkleFile {
-				if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-					Root:  rootLowerVD,
-					Start: rootLowerVD,
-					Path:  fspath.Parse(merklePrefix + filename),
-				}, &vfs.PathOperation{
-					Root:  rootLowerVD,
-					Start: rootLowerVD,
-					Path:  fspath.Parse(merklePrefix + newFilename),
-				}, &vfs.RenameOptions{}); err != nil {
+				if err := dentryFromVD(t, root).renameLowerMerkleAt(ctx, vfsObj, filename, newFilename); err != nil {
 					t.Fatalf("UnlinkAt: %v", err)
 				}
 			}
 
 			// Ensure reopening the verity enabled file fails.
-			if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
-				Root:  root,
-				Start: root,
-				Path:  fspath.Parse(filename),
-			}, &vfs.OpenOptions{
-				Flags: linux.O_RDONLY,
-				Mode:  linux.ModeRegular,
-			}); err != syserror.EIO {
+			if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO {
 				t.Errorf("got OpenAt error: %v, expected EIO", err)
 			}
 		})
diff --git a/pkg/sentry/fsmetric/BUILD b/pkg/sentry/fsmetric/BUILD
new file mode 100644
index 000000000..4e86fbdd8
--- /dev/null
+++ b/pkg/sentry/fsmetric/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "fsmetric",
+    srcs = ["fsmetric.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = ["//pkg/metric"],
+)
diff --git a/pkg/sentry/fsmetric/fsmetric.go b/pkg/sentry/fsmetric/fsmetric.go
new file mode 100644
index 000000000..7e535b527
--- /dev/null
+++ b/pkg/sentry/fsmetric/fsmetric.go
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsmetric defines filesystem metrics that are used by both VFS1 and
+// VFS2.
+//
+// TODO(gvisor.dev/issue/1624): Once VFS1 is deleted, inline these metrics into
+// VFS2.
+package fsmetric
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/metric"
+)
+
+// RecordWaitTime enables the ReadWait, GoferReadWait9P, GoferReadWaitHost, and
+// TmpfsReadWait metrics. Enabling this comes at a CPU cost due to performing
+// three clock reads per read call.
+//
+// Note that this is only performed in the direct read path, and may not be
+// consistently applied for other forms of reads, such as splice.
+var RecordWaitTime = false
+
+// Metrics that apply to all filesystems.
+var (
+	Opens    = metric.MustCreateNewUint64Metric("/fs/opens", false /* sync */, "Number of file opens.")
+	Reads    = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.")
+	ReadWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
+)
+
+// Metrics that only apply to fs/gofer and fsimpl/gofer.
+var (
+	GoferOpensWX      = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a executable file was opened writably from a gofer.")
+	GoferOpens9P      = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a file was opened from a gofer and did not have a host file descriptor.")
+	GoferOpensHost    = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a file was opened from a gofer and did have a host file descriptor.")
+	GoferReads9P      = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.")
+	GoferReadWait9P   = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.")
+	GoferReadsHost    = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.")
+	GoferReadWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.")
+)
+
+// Metrics that only apply to fs/tmpfs and fsimpl/tmpfs.
+var (
+	TmpfsOpensRO  = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.")
+	TmpfsOpensW   = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.")
+	TmpfsReads    = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.")
+	TmpfsReadWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.")
+)
+
+// StartReadWait indicates the beginning of a file read.
+func StartReadWait() time.Time {
+	if !RecordWaitTime {
+		return time.Time{}
+	}
+	return time.Now()
+}
+
+// FinishReadWait indicates the end of a file read whose time is accounted by
+// m. start must be the value returned by the corresponding call to
+// StartReadWait.
+//
+// FinishReadWait is marked nosplit for performance since it's often called
+// from defer statements, which prevents it from being inlined
+// (https://github.com/golang/go/issues/38471).
+//go:nosplit
+func FinishReadWait(m *metric.Uint64Metric, start time.Time) {
+	if !RecordWaitTime {
+		return
+	}
+	m.IncrementBy(uint64(time.Since(start).Nanoseconds()))
+}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 15519f0df..61aeca044 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -273,7 +273,7 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
 //
 // Callback is called when one of the files we're polling becomes ready. It
 // moves said file to the readyList if it's currently in the waiting list.
-func (p *pollEntry) Callback(*waiter.Entry) {
+func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) {
 	e := p.epoll
 
 	e.listsMu.Lock()
@@ -306,9 +306,8 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
 	f.EventRegister(&entry.waiter, entry.mask)
 
 	// Check if the file happens to already be in a ready state.
-	ready := f.Readiness(entry.mask) & entry.mask
-	if ready != 0 {
-		entry.Callback(&entry.waiter)
+	if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 {
+		entry.Callback(&entry.waiter, ready)
 	}
 }
 
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 2b3955598..f855f038b 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -8,11 +8,13 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
         "//pkg/sync",
+        "//pkg/syserror",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index 153d2cd9b..b66d61c6f 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -17,22 +17,45 @@ package fasync
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// New creates a new fs.FileAsync.
-func New() fs.FileAsync {
-	return &FileAsync{}
+// Table to convert waiter event masks into si_band siginfo codes.
+// Taken from fs/fcntl.c:band_table.
+var bandTable = map[waiter.EventMask]int64{
+	// POLL_IN
+	waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM,
+	// POLL_OUT
+	waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND,
+	// POLL_ERR
+	waiter.EventErr: linux.EPOLLERR,
+	// POLL_PRI
+	waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND,
+	// POLL_HUP
+	waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR,
 }
 
-// NewVFS2 creates a new vfs.FileAsync.
-func NewVFS2() vfs.FileAsync {
-	return &FileAsync{}
+// New returns a function that creates a new fs.FileAsync with the given file
+// descriptor.
+func New(fd int) func() fs.FileAsync {
+	return func() fs.FileAsync {
+		return &FileAsync{fd: fd}
+	}
+}
+
+// NewVFS2 returns a function that creates a new vfs.FileAsync with the given
+// file descriptor.
+func NewVFS2(fd int) func() vfs.FileAsync {
+	return func() vfs.FileAsync {
+		return &FileAsync{fd: fd}
+	}
 }
 
 // FileAsync sends signals when the registered file is ready for IO.
@@ -42,6 +65,12 @@ type FileAsync struct {
 	// e is immutable after first use (which is protected by mu below).
 	e waiter.Entry
 
+	// fd is the file descriptor to notify about.
+	// It is immutable, set at allocation time. This matches Linux semantics in
+	// fs/fcntl.c:fasync_helper.
+	// The fd value is passed to the signal recipient in siginfo.si_fd.
+	fd int
+
 	// regMu protects registeration and unregistration actions on e.
 	//
 	// regMu must be held while registration decisions are being made
@@ -56,6 +85,10 @@ type FileAsync struct {
 	mu         sync.Mutex `state:"nosave"`
 	requester  *auth.Credentials
 	registered bool
+	// signal is the signal to deliver upon I/O being available.
+	// The default value ("zero signal") means the default SIGIO signal will be
+	// delivered.
+	signal linux.Signal
 
 	// Only one of the following is allowed to be non-nil.
 	recipientPG *kernel.ProcessGroup
@@ -64,10 +97,10 @@ type FileAsync struct {
 }
 
 // Callback sends a signal.
-func (a *FileAsync) Callback(e *waiter.Entry) {
+func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) {
 	a.mu.Lock()
+	defer a.mu.Unlock()
 	if !a.registered {
-		a.mu.Unlock()
 		return
 	}
 	t := a.recipientT
@@ -80,19 +113,34 @@ func (a *FileAsync) Callback(e *waiter.Entry) {
 	}
 	if t == nil {
 		// No recipient has been registered.
-		a.mu.Unlock()
 		return
 	}
 	c := t.Credentials()
 	// Logic from sigio_perm in fs/fcntl.c.
-	if a.requester.EffectiveKUID == 0 ||
+	permCheck := (a.requester.EffectiveKUID == 0 ||
 		a.requester.EffectiveKUID == c.SavedKUID ||
 		a.requester.EffectiveKUID == c.RealKUID ||
 		a.requester.RealKUID == c.SavedKUID ||
-		a.requester.RealKUID == c.RealKUID {
-		t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO))
+		a.requester.RealKUID == c.RealKUID)
+	if !permCheck {
+		return
 	}
-	a.mu.Unlock()
+	signalInfo := &arch.SignalInfo{
+		Signo: int32(linux.SIGIO),
+		Code:  arch.SignalInfoKernel,
+	}
+	if a.signal != 0 {
+		signalInfo.Signo = int32(a.signal)
+		signalInfo.SetFD(uint32(a.fd))
+		var band int64
+		for m, bandCode := range bandTable {
+			if m&mask != 0 {
+				band |= bandCode
+			}
+		}
+		signalInfo.SetBand(band)
+	}
+	t.SendSignal(signalInfo)
 }
 
 // Register sets the file which will be monitored for IO events.
@@ -186,3 +234,25 @@ func (a *FileAsync) ClearOwner() {
 	a.recipientTG = nil
 	a.recipientPG = nil
 }
+
+// Signal returns which signal will be sent to the signal recipient.
+// A value of zero means the signal to deliver wasn't customized, which means
+// the default signal (SIGIO) will be delivered.
+func (a *FileAsync) Signal() linux.Signal {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	return a.signal
+}
+
+// SetSignal overrides which signal to send when I/O is available.
+// The default behavior can be reset by specifying signal zero, which means
+// to send SIGIO.
+func (a *FileAsync) SetSignal(signal linux.Signal) error {
+	if signal != 0 && !signal.IsValid() {
+		return syserror.EINVAL
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.signal = signal
+	return nil
+}
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 470d8bf83..f17f9c59c 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -121,18 +121,21 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		panic("VFS1 and VFS2 files set")
 	}
 
-	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+	slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
 
 	// Grow the table as required.
-	if last := int32(len(slice)); fd >= last {
+	if last := int32(len(*slicePtr)); fd >= last {
 		end := fd + 1
 		if end < 2*last {
 			end = 2 * last
 		}
-		slice = append(slice, make([]unsafe.Pointer, end-last)...)
-		atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+		newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...)
+		slicePtr = &newSlice
+		atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr))
 	}
 
+	slice := *slicePtr
+
 	var desc *descriptor
 	if file != nil || fileVFS2 != nil {
 		desc = &descriptor{
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index b99c0bffa..31198d772 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -29,17 +29,17 @@ import (
 )
 
 const (
-	valueMax = 32767 // SEMVMX
+	// Maximum semaphore value.
+	valueMax = linux.SEMVMX
 
-	// semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
-	semaphoresMax = 32000
+	// Maximum number of semaphore sets.
+	setsMax = linux.SEMMNI
 
-	// setMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
-	setsMax = 32000
+	// Maximum number of semaphroes in a semaphore set.
+	semsMax = linux.SEMMSL
 
-	// semaphoresTotalMax is "system-wide limit on the number of semaphores"
-	// (SEMMNS = SEMMNI*SEMMSL).
-	semaphoresTotalMax = 1024000000
+	// Maximum number of semaphores in all semaphroe sets.
+	semsTotalMax = linux.SEMMNS
 )
 
 // Registry maintains a set of semaphores that can be found by key or ID.
@@ -122,7 +122,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
 // be found. If exclusive is true, it fails if a set with the same key already
 // exists.
 func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
-	if nsems < 0 || nsems > semaphoresMax {
+	if nsems < 0 || nsems > semsMax {
 		return nil, syserror.EINVAL
 	}
 
@@ -166,7 +166,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
 	if len(r.semaphores) >= setsMax {
 		return nil, syserror.EINVAL
 	}
-	if r.totalSems() > int(semaphoresTotalMax-nsems) {
+	if r.totalSems() > int(semsTotalMax-nsems) {
 		return nil, syserror.EINVAL
 	}
 
@@ -176,6 +176,22 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
 	return r.newSet(ctx, key, owner, owner, perms, nsems)
 }
 
+// IPCInfo returns information about system-wide semaphore limits and parameters.
+func (r *Registry) IPCInfo() *linux.SemInfo {
+	return &linux.SemInfo{
+		SemMap: linux.SEMMAP,
+		SemMni: linux.SEMMNI,
+		SemMns: linux.SEMMNS,
+		SemMnu: linux.SEMMNU,
+		SemMsl: linux.SEMMSL,
+		SemOpm: linux.SEMOPM,
+		SemUme: linux.SEMUME,
+		SemUsz: 0, // SemUsz not supported.
+		SemVmx: linux.SEMVMX,
+		SemAem: linux.SEMAEM,
+	}
+}
+
 // RemoveID removes set with give 'id' from the registry and marks the set as
 // dead. All waiters will be awakened and fail.
 func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index a83ce219c..3fee7aa68 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -75,6 +75,12 @@ func (s *syslog) Log() []byte {
 		"Checking naughty and nice process list...", // Check it up to twice.
 		"Granting licence to kill(2)...",            // British spelling for British movie.
 		"Letting the watchdogs out...",
+		"Conjuring /dev/null black hole...",
+		"Adversarially training Redcode AI...",
+		"Singleplexing /dev/ptmx...",
+		"Recruiting cron-ies...",
+		"Verifying that no non-zero bytes made their way into /dev/zero...",
+		"Accelerating teletypewriter to 9600 baud...",
 	}
 
 	selectMessage := func() string {
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 147a3d286..9419f2e95 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -177,19 +177,23 @@ func (t *Task) SleepStart() <-chan struct{} {
 // SleepFinish implements context.ChannelSleeper.SleepFinish.
 func (t *Task) SleepFinish(success bool) {
 	if !success {
-		// The interrupted notification is consumed only at the top-level
-		// (Run). Therefore we attempt to reset the pending notification.
-		// This will also elide our next entry back into the task, so we
-		// will process signals, state changes, etc.
+		// Our caller received from t.interruptChan; we need to re-send to it
+		// to ensure that t.interrupted() is still true.
 		t.interruptSelf()
 	}
 	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
 	t.Activate()
 }
 
-// Interrupted implements amutex.Sleeper.Interrupted
+// Interrupted implements context.ChannelSleeper.Interrupted.
 func (t *Task) Interrupted() bool {
-	return len(t.interruptChan) != 0
+	if t.interrupted() {
+		return true
+	}
+	// Indicate that t's task goroutine is still responsive (i.e. reset the
+	// watchdog timer).
+	t.accountTaskGoroutineRunning()
+	return false
 }
 
 // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
@@ -210,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) {
 }
 
 // interrupted returns true if interrupt or interruptSelf has been called at
-// least once since the last call to interrupted.
+// least once since the last call to unsetInterrupted.
 func (t *Task) interrupted() bool {
+	return len(t.interruptChan) != 0
+}
+
+// unsetInterrupted causes interrupted to return false until the next call to
+// interrupt or interruptSelf.
+func (t *Task) unsetInterrupted() {
 	select {
 	case <-t.interruptChan:
-		return true
 	default:
-		return false
 	}
 }
 
@@ -232,9 +240,7 @@ func (t *Task) interrupt() {
 func (t *Task) interruptSelf() {
 	select {
 	case t.interruptChan <- struct{}{}:
-		t.Debugf("Interrupt queued")
 	default:
-		t.Debugf("Dropping duplicate interrupt")
 	}
 	// platform.Context.Interrupt() is unnecessary since a task goroutine
 	// calling interruptSelf() cannot also be blocked in
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 52c55d13d..9ba5f8d78 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
 	t.goschedSeq.EndWrite()
 }
 
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineRunning() {
+	now := t.k.CPUClockNow()
+	if t.gosched.State != TaskGoroutineRunningSys {
+		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
+	}
+	t.goschedSeq.BeginWrite()
+	t.gosched.SysTicks += now - t.gosched.Timestamp
+	t.gosched.Timestamp = now
+	t.goschedSeq.EndWrite()
+}
+
 // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
 // Most clients should use t.CPUStats() instead.
 func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index ebdb83061..42dd3e278 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
 				return
 			}
 		})
-		// We have to re-issue the interrupt consumed by t.interrupted() since
-		// it might have been for a different reason.
-		t.interruptSelf()
 	}
 
 	// Conversely, if the new mask unblocks any signals that were blocked by
@@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) {
 type runInterrupt struct{}
 
 func (*runInterrupt) execute(t *Task) taskRunState {
-	// Interrupts are de-duplicated (if t is interrupted twice before
-	// t.interrupted() is called, t.interrupted() will only return true once),
-	// so early exits from this function must re-enter the runInterrupt state
-	// to check for more interrupt-signaled conditions.
+	// Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect
+	// of all previous calls to t.interrupted() regardless of how many such
+	// calls there have been), so early exits from this function must re-enter
+	// the runInterrupt state to check for more interrupt-signaled conditions.
 
 	t.tg.signalHandlers.mu.Lock()
 
@@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState {
 		return t.deliverSignal(info, act)
 	}
 
+	t.unsetInterrupted()
 	t.tg.signalHandlers.mu.Unlock()
 	return (*runApp)(nil)
 }
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 7c297fb9e..d99be7f46 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -423,11 +423,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.File
 	}
 
 	if f.opts.ManualZeroing {
-		if err := f.forEachMappingSlice(fr, func(bs []byte) {
-			for i := range bs {
-				bs[i] = 0
-			}
-		}); err != nil {
+		if err := f.manuallyZero(fr); err != nil {
 			return memmap.FileRange{}, err
 		}
 	}
@@ -560,19 +556,39 @@ func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
 		panic(fmt.Sprintf("invalid range: %v", fr))
 	}
 
+	if f.opts.ManualZeroing {
+		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
+		// effect.
+		if err := f.manuallyZero(fr); err != nil {
+			return err
+		}
+	} else {
+		if err := f.decommitFile(fr); err != nil {
+			return err
+		}
+	}
+
+	f.markDecommitted(fr)
+	return nil
+}
+
+func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
+	return f.forEachMappingSlice(fr, func(bs []byte) {
+		for i := range bs {
+			bs[i] = 0
+		}
+	})
+}
+
+func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
 	// "After a successful call, subsequent reads from this range will
 	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
 	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
-	err := syscall.Fallocate(
+	return syscall.Fallocate(
 		int(f.file.Fd()),
 		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
 		int64(fr.Start),
 		int64(fr.Length()))
-	if err != nil {
-		return err
-	}
-	f.markDecommitted(fr)
-	return nil
 }
 
 func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
@@ -1044,20 +1060,20 @@ func (f *MemoryFile) runReclaim() {
 			break
 		}
 
-		if err := f.Decommit(fr); err != nil {
-			log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
-			// Zero the pages manually. This won't reduce memory usage, but at
-			// least ensures that the pages will be zero when reallocated.
-			f.forEachMappingSlice(fr, func(bs []byte) {
-				for i := range bs {
-					bs[i] = 0
+		// If ManualZeroing is in effect, pages will be zeroed on allocation
+		// and may not be freed by decommitFile, so calling decommitFile is
+		// unnecessary.
+		if !f.opts.ManualZeroing {
+			if err := f.decommitFile(fr); err != nil {
+				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
+				// Zero the pages manually. This won't reduce memory usage, but at
+				// least ensures that the pages will be zero when reallocated.
+				if err := f.manuallyZero(fr); err != nil {
+					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
 				}
-			})
-			// Pretend the pages were decommitted even though they weren't,
-			// since the memory accounting implementation has no idea how to
-			// deal with this.
-			f.markDecommitted(fr)
+			}
 		}
+		f.markDecommitted(fr)
 		f.markReclaimed(fr)
 	}
 
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index fd92c3873..3f5be276b 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -263,13 +263,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 		return usermem.NoAccess, platform.ErrContextInterrupt
 	case ring0.El0SyncUndef:
 		return c.fault(int32(syscall.SIGILL), info)
-	case ring0.El1SyncUndef:
-		*info = arch.SignalInfo{
-			Signo: int32(syscall.SIGILL),
-			Code:  1, // ILL_ILLOPC (illegal opcode).
-		}
-		info.SetAddr(switchOpts.Registers.Pc) // Include address.
-		return usermem.AccessType{}, platform.ErrContextSignal
 	default:
 		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
 	}
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index f56aa3b79..571bfcc2e 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -18,8 +18,8 @@
 //
 // In a nutshell, it works as follows:
 //
-// The creation of a new address space creates a new child processes with a
-// single thread which is traced by a single goroutine.
+// The creation of a new address space creates a new child process with a single
+// thread which is traced by a single goroutine.
 //
 // A context is just a collection of temporary variables. Calling Switch on a
 // context does the following:
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 327d48465..c51df2811 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -90,6 +90,7 @@ const (
 	El0SyncIa
 	El0SyncFpsimdAcc
 	El0SyncSveAcc
+	El0SyncFpsimdExc
 	El0SyncSys
 	El0SyncSpPc
 	El0SyncUndef
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 266817d82..cf0bf3528 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -290,6 +290,18 @@
 	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
 	MSR RSV_REG, TTBR0_EL1;
 
+TEXT ·EnableVFP(SB),NOSPLIT,$0
+	MOVD $FPEN_ENABLE, R0
+	WORD $0xd5181040 //MSR R0, CPACR_EL1
+	ISB $15
+	RET
+
+TEXT ·DisableVFP(SB),NOSPLIT,$0
+	MOVD $0, R0
+	WORD $0xd5181040 //MSR R0, CPACR_EL1
+	ISB $15
+	RET
+
 #define VFP_ENABLE \
 	MOVD $FPEN_ENABLE, R0; \
 	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
@@ -336,12 +348,12 @@
 	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
 	LOAD_KERNEL_STACK(RSV_REG);  // Load the temporary stack.
 
-// EXCEPTION_WITH_ERROR is a common exception handler function.
-#define EXCEPTION_WITH_ERROR(user, vector) \
+// EXCEPTION_EL0 is a common el0 exception handler function.
+#define EXCEPTION_EL0(vector) \
 	WORD $0xd538d092; \	//MRS   TPIDR_EL1, R18
 	WORD $0xd538601a; \	//MRS   FAR_EL1, R26
 	MOVD R26, CPU_FAULT_ADDR(RSV_REG); \
-	MOVD $user, R3; \
+	MOVD $1, R3; \
 	MOVD R3, CPU_ERROR_TYPE(RSV_REG); \	// Set error type to user.
 	MOVD $vector, R3; \
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG); \
@@ -349,6 +361,12 @@
 	MOVD R3, CPU_ERROR_CODE(RSV_REG); \
 	B ·kernelExitToEl1(SB);
 
+// EXCEPTION_EL1 is a common el1 exception handler function.
+#define EXCEPTION_EL1(vector) \
+	MOVD $vector, R3; \
+	MOVD R3, 8(RSP); \
+	B ·HaltEl1ExceptionAndResume(SB);
+
 // storeAppASID writes the application's asid value.
 TEXT ·storeAppASID(SB),NOSPLIT,$0-8
 	MOVD asid+0(FP), R1
@@ -396,6 +414,16 @@ TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0
 	CALL ·kernelSyscall(SB)     // Call the trampoline.
 	B ·kernelExitToEl1(SB)      // Resume.
 
+// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume.
+TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0-8
+	WORD $0xd538d092            // MRS TPIDR_EL1, R18
+	MOVD CPU_SELF(RSV_REG), R3  // Load vCPU.
+	MOVD R3, 8(RSP)             // First argument (vCPU).
+	MOVD vector+0(FP), R3
+	MOVD R3, 16(RSP)            // Second argument (vector).
+	CALL ·kernelException(SB)   // Call the trampoline.
+	B ·kernelExitToEl1(SB)      // Resume.
+
 // Shutdown stops the guest.
 TEXT ·Shutdown(SB),NOSPLIT,$0
 	// PSCI EVENT.
@@ -558,39 +586,22 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
 	B el1_invalid
 
 el1_da:
+	EXCEPTION_EL1(El1SyncDa)
 el1_ia:
-	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
-	WORD $0xd538601a     //MRS   FAR_EL1, R26
-
-	MOVD R26, CPU_FAULT_ADDR(RSV_REG)
-
-	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
-
-	MOVD $PageFault, R3
-	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
-	B ·HaltAndResume(SB)
-
+	EXCEPTION_EL1(El1SyncIa)
 el1_sp_pc:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL1(El1SyncSpPc)
 el1_undef:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL1(El1SyncUndef)
 el1_svc:
-	MOVD $0, CPU_ERROR_CODE(RSV_REG)
-	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
 	B ·HaltEl1SvcAndResume(SB)
-
 el1_dbg:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL1(El1SyncDbg)
 el1_fpsimd_acc:
 	VFP_ENABLE
 	B ·kernelExitToEl1(SB)  // Resume.
-
 el1_invalid:
-	B ·Shutdown(SB)
+	EXCEPTION_EL1(El1SyncInv)
 
 // El1_irq is the handler for El1_irq.
 TEXT ·El1_irq(SB),NOSPLIT,$0
@@ -646,28 +657,21 @@ el0_svc:
 
 el0_da:
 el0_ia:
-	EXCEPTION_WITH_ERROR(1, PageFault)
-
+	EXCEPTION_EL0(PageFault)
 el0_fpsimd_acc:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL0(El0SyncFpsimdAcc)
 el0_sve_acc:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL0(El0SyncSveAcc)
 el0_fpsimd_exc:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL0(El0SyncFpsimdExc)
 el0_sp_pc:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL0(El0SyncSpPc)
 el0_undef:
-	EXCEPTION_WITH_ERROR(1, El0SyncUndef)
-
+	EXCEPTION_EL0(El0SyncUndef)
 el0_dbg:
-	B ·Shutdown(SB)
-
+	EXCEPTION_EL0(El0SyncDbg)
 el0_invalid:
-	B ·Shutdown(SB)
+	EXCEPTION_EL0(El0SyncInv)
 
 TEXT ·El0_irq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index 6cbbf001f..90a7b8392 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -24,6 +24,10 @@ func HaltAndResume()
 //go:nosplit
 func HaltEl1SvcAndResume()
 
+// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume.
+//go:nosplit
+func HaltEl1ExceptionAndResume()
+
 // init initializes architecture-specific state.
 func (k *Kernel) init(maxCPUs int) {
 }
@@ -61,11 +65,13 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	regs.Pstate &= ^uint64(PsrFlagsClear)
 	regs.Pstate |= UserFlagsSet
 
+	EnableVFP()
 	LoadFloatingPoint(switchOpts.FloatingPointState)
 
 	kernelExitToEl0()
 
 	SaveFloatingPoint(switchOpts.FloatingPointState)
+	DisableVFP()
 
 	vector = c.vecCode
 
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 842aa7e30..ef0d8974d 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -53,6 +53,12 @@ func LoadFloatingPoint(*byte)
 // SaveFloatingPoint saves floating point state.
 func SaveFloatingPoint(*byte)
 
+// EnableVFP enables fpsimd.
+func EnableVFP()
+
+// DisableVFP disables fpsimd.
+func DisableVFP()
+
 // Init sets function pointers based on architectural features.
 //
 // This must be called prior to using ring0.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 19c1fca8b..6f4923539 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -35,62 +35,47 @@ TEXT ·CPACREL1(SB),NOSPLIT,$0-8
 	RET
 
 TEXT ·GetFPCR(SB),NOSPLIT,$0-8
-	WORD $0xd53b4201    	// MRS NZCV, R1
+	MOVD FPCR, R1
 	MOVD R1, ret+0(FP)
 	RET
 
 TEXT ·GetFPSR(SB),NOSPLIT,$0-8
-	WORD $0xd53b4421   	// MRS FPSR, R1
+	MOVD FPSR, R1
 	MOVD R1, ret+0(FP)
 	RET
 
 TEXT ·SetFPCR(SB),NOSPLIT,$0-8
 	MOVD addr+0(FP), R1
-	WORD $0xd51b4201  	// MSR R1, NZCV
+	MOVD R1, FPCR
 	RET
 
 TEXT ·SetFPSR(SB),NOSPLIT,$0-8
 	MOVD addr+0(FP), R1
-	WORD $0xd51b4421   	// MSR R1, FPSR
+	MOVD R1, FPSR
 	RET
 
 TEXT ·SaveVRegs(SB),NOSPLIT,$0-8
 	MOVD addr+0(FP), R0
 
 	// Skip aarch64_ctx, fpsr, fpcr.
-	FMOVD F0, 16*1(R0)
-	FMOVD F1, 16*2(R0)
-	FMOVD F2, 16*3(R0)
-	FMOVD F3, 16*4(R0)
-	FMOVD F4, 16*5(R0)
-	FMOVD F5, 16*6(R0)
-	FMOVD F6, 16*7(R0)
-	FMOVD F7, 16*8(R0)
-	FMOVD F8, 16*9(R0)
-	FMOVD F9, 16*10(R0)
-	FMOVD F10, 16*11(R0)
-	FMOVD F11, 16*12(R0)
-	FMOVD F12, 16*13(R0)
-	FMOVD F13, 16*14(R0)
-	FMOVD F14, 16*15(R0)
-	FMOVD F15, 16*16(R0)
-	FMOVD F16, 16*17(R0)
-	FMOVD F17, 16*18(R0)
-	FMOVD F18, 16*19(R0)
-	FMOVD F19, 16*20(R0)
-	FMOVD F20, 16*21(R0)
-	FMOVD F21, 16*22(R0)
-	FMOVD F22, 16*23(R0)
-	FMOVD F23, 16*24(R0)
-	FMOVD F24, 16*25(R0)
-	FMOVD F25, 16*26(R0)
-	FMOVD F26, 16*27(R0)
-	FMOVD F27, 16*28(R0)
-	FMOVD F28, 16*29(R0)
-	FMOVD F29, 16*30(R0)
-	FMOVD F30, 16*31(R0)
-	FMOVD F31, 16*32(R0)
-	ISB $15
+	ADD $16, R0, R0
+
+	WORD $0xad000400       //  stp  q0, q1, [x0]
+	WORD $0xad010c02       //  stp  q2, q3, [x0, #32]
+	WORD $0xad021404       //  stp  q4, q5, [x0, #64]
+	WORD $0xad031c06       //  stp  q6, q7, [x0, #96]
+	WORD $0xad042408       //  stp  q8, q9, [x0, #128]
+	WORD $0xad052c0a       //  stp  q10, q11, [x0, #160]
+	WORD $0xad06340c       //  stp  q12, q13, [x0, #192]
+	WORD $0xad073c0e       //  stp  q14, q15, [x0, #224]
+	WORD $0xad084410       //  stp  q16, q17, [x0, #256]
+	WORD $0xad094c12       //  stp  q18, q19, [x0, #288]
+	WORD $0xad0a5414       //  stp  q20, q21, [x0, #320]
+	WORD $0xad0b5c16       //  stp  q22, q23, [x0, #352]
+	WORD $0xad0c6418       //  stp  q24, q25, [x0, #384]
+	WORD $0xad0d6c1a       //  stp  q26, q27, [x0, #416]
+	WORD $0xad0e741c       //  stp  q28, q29, [x0, #448]
+	WORD $0xad0f7c1e       //  stp  q30, q31, [x0, #480]
 
 	RET
 
@@ -98,39 +83,24 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
 	MOVD addr+0(FP), R0
 
 	// Skip aarch64_ctx, fpsr, fpcr.
-	FMOVD 16*1(R0), F0
-	FMOVD 16*2(R0), F1
-	FMOVD 16*3(R0), F2
-	FMOVD 16*4(R0), F3
-	FMOVD 16*5(R0), F4
-	FMOVD 16*6(R0), F5
-	FMOVD 16*7(R0), F6
-	FMOVD 16*8(R0), F7
-	FMOVD 16*9(R0), F8
-	FMOVD 16*10(R0), F9
-	FMOVD 16*11(R0), F10
-	FMOVD 16*12(R0), F11
-	FMOVD 16*13(R0), F12
-	FMOVD 16*14(R0), F13
-	FMOVD 16*15(R0), F14
-	FMOVD 16*16(R0), F15
-	FMOVD 16*17(R0), F16
-	FMOVD 16*18(R0), F17
-	FMOVD 16*19(R0), F18
-	FMOVD 16*20(R0), F19
-	FMOVD 16*21(R0), F20
-	FMOVD 16*22(R0), F21
-	FMOVD 16*23(R0), F22
-	FMOVD 16*24(R0), F23
-	FMOVD 16*25(R0), F24
-	FMOVD 16*26(R0), F25
-	FMOVD 16*27(R0), F26
-	FMOVD 16*28(R0), F27
-	FMOVD 16*29(R0), F28
-	FMOVD 16*30(R0), F29
-	FMOVD 16*31(R0), F30
-	FMOVD 16*32(R0), F31
-	ISB $15
+	ADD $16, R0, R0
+
+	WORD $0xad400400    // ldp  q0, q1, [x0]
+	WORD $0xad410c02    // ldp  q2, q3, [x0, #32]
+	WORD $0xad421404    // ldp  q4, q5, [x0, #64]
+	WORD $0xad431c06    // ldp  q6, q7, [x0, #96]
+	WORD $0xad442408    // ldp  q8, q9, [x0, #128]
+	WORD $0xad452c0a    // ldp  q10, q11, [x0, #160]
+	WORD $0xad46340c    // ldp  q12, q13, [x0, #192]
+	WORD $0xad473c0e    // ldp  q14, q15, [x0, #224]
+	WORD $0xad484410    // ldp  q16, q17, [x0, #256]
+	WORD $0xad494c12    // ldp  q18, q19, [x0, #288]
+	WORD $0xad4a5414    // ldp  q20, q21, [x0, #320]
+	WORD $0xad4b5c16    // ldp  q22, q23, [x0, #352]
+	WORD $0xad4c6418    // ldp  q24, q25, [x0, #384]
+	WORD $0xad4d6c1a    // ldp  q26, q27, [x0, #416]
+	WORD $0xad4e741c    // ldp  q28, q29, [x0, #448]
+	WORD $0xad4f7c1e    // ldp  q30, q31, [x0, #480]
 
 	RET
 
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 53bc3353c..b5652deb9 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -70,6 +70,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa)
 	fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc)
 	fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc)
+	fmt.Fprintf(w, "#define El0SyncFpsimdExc 0x%02x\n", El0SyncFpsimdExc)
 	fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys)
 	fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc)
 	fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index bc16a1622..7605d0cb2 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -58,6 +58,15 @@ type PageTables struct {
 	readOnlyShared bool
 }
 
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+}
+
 // NewWithUpper returns new PageTables.
 //
 // upperSharedPageTables are used for mapping the upper of addresses,
@@ -73,14 +82,17 @@ type PageTables struct {
 func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables {
 	p := new(PageTables)
 	p.Init(a)
+
 	if upperSharedPageTables != nil {
 		if !upperSharedPageTables.readOnlyShared {
 			panic("Only read-only shared pagetables can be used as upper")
 		}
 		p.upperSharedPageTables = upperSharedPageTables
 		p.upperStart = upperStart
-		p.cloneUpperShared()
 	}
+
+	p.InitArch(a)
+
 	return p
 }
 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index a4e416af7..520161755 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -24,6 +24,14 @@ import (
 
 // archPageTables is architecture-specific data.
 type archPageTables struct {
+	// root is the pagetable root for kernel space.
+	root *PTEs
+
+	// rootPhysical is the cached physical address of the root.
+	//
+	// This is saved only to prevent constant translation.
+	rootPhysical uintptr
+
 	asid uint16
 }
 
@@ -38,7 +46,7 @@ func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
 //
 //go:nosplit
 func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
-	return uint64(p.upperSharedPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+	return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
 }
 
 // Bits in page table entries.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
index e7ab887e5..4bdde8448 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -41,13 +41,13 @@ const (
 	entriesPerPage = 512
 )
 
-// Init initializes a set of PageTables.
+// InitArch does some additional initialization related to the architecture.
 //
 //go:nosplit
-func (p *PageTables) Init(allocator Allocator) {
-	p.Allocator = allocator
-	p.root = p.Allocator.NewPTEs()
-	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+func (p *PageTables) InitArch(allocator Allocator) {
+	if p.upperSharedPageTables != nil {
+		p.cloneUpperShared()
+	}
 }
 
 func pgdIndex(upperStart uintptr) uintptr {
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
index 5392bf27a..ad0e30c88 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -42,13 +42,16 @@ const (
 	entriesPerPage = 512
 )
 
-// Init initializes a set of PageTables.
+// InitArch does some additional initialization related to the architecture.
 //
 //go:nosplit
-func (p *PageTables) Init(allocator Allocator) {
-	p.Allocator = allocator
-	p.root = p.Allocator.NewPTEs()
-	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+func (p *PageTables) InitArch(allocator Allocator) {
+	if p.upperSharedPageTables != nil {
+		p.cloneUpperShared()
+	} else {
+		p.archPageTables.root = p.Allocator.NewPTEs()
+		p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
+	}
 }
 
 // cloneUpperShared clone the upper from the upper shared page tables.
@@ -59,7 +62,8 @@ func (p *PageTables) cloneUpperShared() {
 		panic("upperStart should be the same as upperBottom")
 	}
 
-	// nothing to do for arm.
+	p.archPageTables.root = p.upperSharedPageTables.archPageTables.root
+	p.archPageTables.rootPhysical = p.upperSharedPageTables.archPageTables.rootPhysical
 }
 
 // PTEs is a collection of entries.
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
index 157c9a7cc..c261d393a 100644
--- a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
@@ -116,7 +116,7 @@ func next(start uintptr, size uintptr) uintptr {
 func (w *Walker) iterateRangeCanonical(start, end uintptr) {
 	pgdEntryIndex := w.pageTables.root
 	if start >= upperBottom {
-		pgdEntryIndex = w.pageTables.upperSharedPageTables.root
+		pgdEntryIndex = w.pageTables.archPageTables.root
 	}
 
 	for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index a3f775d15..cc1f6bfcc 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -20,6 +20,7 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/tcpip",
+        "//pkg/tcpip/header",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 3baad098b..057f4d294 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -120,9 +120,6 @@ type socketOpsCommon struct {
 	// fixed buffer but only consume this many bytes.
 	sendBufferSize uint32
 
-	// passcred indicates if this socket wants SCM credentials.
-	passcred bool
-
 	// filter indicates that this socket has a BPF filter "installed".
 	//
 	// TODO(gvisor.dev/issue/1119): We don't actually support filtering,
@@ -201,10 +198,7 @@ func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
 
 // Passcred implements transport.Credentialer.Passcred.
 func (s *socketOpsCommon) Passcred() bool {
-	s.mu.Lock()
-	passcred := s.passcred
-	s.mu.Unlock()
-	return passcred
+	return s.ep.SocketOptions().GetPassCred()
 }
 
 // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
@@ -419,9 +413,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []
 			}
 			passcred := usermem.ByteOrder.Uint32(opt)
 
-			s.mu.Lock()
-			s.passcred = passcred != 0
-			s.mu.Unlock()
+			s.ep.SocketOptions().SetPassCred(passcred != 0)
 			return nil
 
 		case linux.SO_ATTACH_FILTER:
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 7d0ae15ca..5e9ab97ad 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -84,69 +84,73 @@ var Metrics = tcpip.Stats{
 	MalformedRcvdPackets:       mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."),
 	DroppedPackets:             mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."),
 	ICMP: tcpip.ICMPStats{
-		V4PacketsSent: tcpip.ICMPv4SentPacketStats{
-			ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
-				Echo:           mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."),
-				EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by netstack."),
-				DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."),
-				SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."),
-				Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."),
-				TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."),
-				ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."),
-				Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."),
-				TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."),
-				InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."),
-				InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."),
+		V4: tcpip.ICMPv4Stats{
+			PacketsSent: tcpip.ICMPv4SentPacketStats{
+				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
+					Echo:           mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."),
+					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by netstack."),
+					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."),
+					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."),
+					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."),
+					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."),
+					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."),
+					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."),
+					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."),
+					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."),
+					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."),
+				},
+				Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."),
 			},
-			Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."),
-		},
-		V4PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
-			ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
-				Echo:           mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."),
-				EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."),
-				DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 destination unreachable packets received by netstack."),
-				SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."),
-				Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."),
-				TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."),
-				ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."),
-				Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."),
-				TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."),
-				InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."),
-				InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."),
+			PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
+				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
+					Echo:           mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."),
+					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."),
+					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 destination unreachable packets received by netstack."),
+					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."),
+					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."),
+					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."),
+					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."),
+					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."),
+					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."),
+					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."),
+					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."),
+				},
+				Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."),
 			},
-			Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."),
 		},
-		V6PacketsSent: tcpip.ICMPv6SentPacketStats{
-			ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
-				EchoRequest:     mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."),
-				EchoReply:       mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."),
-				DstUnreachable:  mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."),
-				PacketTooBig:    mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."),
-				TimeExceeded:    mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."),
-				ParamProblem:    mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."),
-				RouterSolicit:   mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."),
-				RouterAdvert:    mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."),
-				NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."),
-				NeighborAdvert:  mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."),
-				RedirectMsg:     mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."),
+		V6: tcpip.ICMPv6Stats{
+			PacketsSent: tcpip.ICMPv6SentPacketStats{
+				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
+					EchoRequest:     mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."),
+					EchoReply:       mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."),
+					DstUnreachable:  mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."),
+					PacketTooBig:    mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."),
+					TimeExceeded:    mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."),
+					ParamProblem:    mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."),
+					RouterSolicit:   mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."),
+					RouterAdvert:    mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."),
+					NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."),
+					NeighborAdvert:  mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."),
+					RedirectMsg:     mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."),
+				},
+				Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."),
 			},
-			Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."),
-		},
-		V6PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
-			ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
-				EchoRequest:     mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."),
-				EchoReply:       mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."),
-				DstUnreachable:  mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."),
-				PacketTooBig:    mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."),
-				TimeExceeded:    mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received by netstack."),
-				ParamProblem:    mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."),
-				RouterSolicit:   mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."),
-				RouterAdvert:    mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."),
-				NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."),
-				NeighborAdvert:  mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."),
-				RedirectMsg:     mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."),
+			PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
+				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
+					EchoRequest:     mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."),
+					EchoReply:       mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."),
+					DstUnreachable:  mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."),
+					PacketTooBig:    mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."),
+					TimeExceeded:    mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received by netstack."),
+					ParamProblem:    mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."),
+					RouterSolicit:   mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."),
+					RouterAdvert:    mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."),
+					NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."),
+					NeighborAdvert:  mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."),
+					RedirectMsg:     mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."),
+				},
+				Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."),
 			},
-			Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."),
 		},
 	},
 	IP: tcpip.IPStats{
@@ -209,18 +213,6 @@ const sizeOfInt32 int = 4
 
 var errStackType = syserr.New("expected but did not receive a netstack.Stack", linux.EINVAL)
 
-// ntohs converts a 16-bit number from network byte order to host byte order. It
-// assumes that the host is little endian.
-func ntohs(v uint16) uint16 {
-	return v<<8 | v>>8
-}
-
-// htons converts a 16-bit number from host byte order to network byte order. It
-// assumes that the host is little endian.
-func htons(v uint16) uint16 {
-	return ntohs(v)
-}
-
 // commonEndpoint represents the intersection of a tcpip.Endpoint and a
 // transport.Endpoint.
 type commonEndpoint interface {
@@ -240,10 +232,6 @@ type commonEndpoint interface {
 	// transport.Endpoint.SetSockOpt.
 	SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error
 
-	// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
-	// transport.Endpoint.SetSockOptBool.
-	SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
-
 	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
 	// transport.Endpoint.SetSockOptInt.
 	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
@@ -252,18 +240,20 @@ type commonEndpoint interface {
 	// transport.Endpoint.GetSockOpt.
 	GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error
 
-	// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
-	// transport.Endpoint.GetSockOpt.
-	GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
-
 	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
 	// transport.Endpoint.GetSockOpt.
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
 
-	// LastError implements tcpip.Endpoint.LastError.
+	// State returns a socket's lifecycle state. The returned value is
+	// protocol-specific and is primarily used for diagnostics.
+	State() uint32
+
+	// LastError implements tcpip.Endpoint.LastError and
+	// transport.Endpoint.LastError.
 	LastError() *tcpip.Error
 
-	// SocketOptions implements tcpip.Endpoint.SocketOptions.
+	// SocketOptions implements tcpip.Endpoint.SocketOptions and
+	// transport.Endpoint.SocketOptions.
 	SocketOptions() *tcpip.SocketOptions
 }
 
@@ -332,9 +322,7 @@ type socketOpsCommon struct {
 // New creates a new endpoint socket.
 func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
 	if skType == linux.SOCK_STREAM {
-		if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
+		endpoint.SocketOptions().SetDelayOption(true)
 	}
 
 	dirent := socket.NewDirent(t, netstackDevice)
@@ -363,88 +351,6 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
 	return tcpip.Address(addr)
 }
 
-// AddressAndFamily reads an sockaddr struct from the given address and
-// converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
-// AF_INET6, and AF_PACKET addresses.
-//
-// AddressAndFamily returns an address and its family.
-func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
-	// Make sure we have at least 2 bytes for the address family.
-	if len(addr) < 2 {
-		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
-	}
-
-	// Get the rest of the fields based on the address family.
-	switch family := usermem.ByteOrder.Uint16(addr); family {
-	case linux.AF_UNIX:
-		path := addr[2:]
-		if len(path) > linux.UnixPathMax {
-			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
-		}
-		// Drop the terminating NUL (if one exists) and everything after
-		// it for filesystem (non-abstract) addresses.
-		if len(path) > 0 && path[0] != 0 {
-			if n := bytes.IndexByte(path[1:], 0); n >= 0 {
-				path = path[:n+1]
-			}
-		}
-		return tcpip.FullAddress{
-			Addr: tcpip.Address(path),
-		}, family, nil
-
-	case linux.AF_INET:
-		var a linux.SockAddrInet
-		if len(addr) < sockAddrInetSize {
-			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
-		}
-		binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a)
-
-		out := tcpip.FullAddress{
-			Addr: bytesToIPAddress(a.Addr[:]),
-			Port: ntohs(a.Port),
-		}
-		return out, family, nil
-
-	case linux.AF_INET6:
-		var a linux.SockAddrInet6
-		if len(addr) < sockAddrInet6Size {
-			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
-		}
-		binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a)
-
-		out := tcpip.FullAddress{
-			Addr: bytesToIPAddress(a.Addr[:]),
-			Port: ntohs(a.Port),
-		}
-		if isLinkLocal(out.Addr) {
-			out.NIC = tcpip.NICID(a.Scope_id)
-		}
-		return out, family, nil
-
-	case linux.AF_PACKET:
-		var a linux.SockAddrLink
-		if len(addr) < sockAddrLinkSize {
-			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
-		}
-		binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a)
-		if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize {
-			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
-		}
-
-		// TODO(gvisor.dev/issue/173): Return protocol too.
-		return tcpip.FullAddress{
-			NIC:  tcpip.NICID(a.InterfaceIndex),
-			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
-		}, family, nil
-
-	case linux.AF_UNSPEC:
-		return tcpip.FullAddress{}, family, nil
-
-	default:
-		return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported
-	}
-}
-
 func (s *socketOpsCommon) isPacketBased() bool {
 	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
 }
@@ -721,11 +627,7 @@ func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error {
 		return nil
 	}
 	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
-		v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption)
-		if err != nil {
-			return syserr.TranslateNetstackError(err)
-		}
-		if !v {
+		if !s.Endpoint.SocketOptions().GetV6Only() {
 			return nil
 		}
 	}
@@ -749,7 +651,7 @@ func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip
 // Connect implements the linux syscall connect(2) for sockets backed by
 // tpcip.Endpoint.
 func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
+	addr, family, err := socket.AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
@@ -830,7 +732,7 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 		}
 	} else {
 		var err *syserr.Error
-		addr, family, err = AddressAndFamily(sockaddr)
+		addr, family, err = socket.AddressAndFamily(sockaddr)
 		if err != nil {
 			return err
 		}
@@ -921,7 +823,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 	var addr linux.SockAddr
 	var addrLen uint32
 	if peerAddr != nil {
-		addr, addrLen = ConvertAddress(s.family, *peerAddr)
+		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
 	}
 
 	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -1005,7 +907,7 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in
 		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
 
 	case linux.SOL_TCP:
-		return getSockOptTCP(t, ep, name, outLen)
+		return getSockOptTCP(t, s, ep, name, outLen)
 
 	case linux.SOL_IPV6:
 		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
@@ -1068,13 +970,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.PasscredOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred()))
+		return &v, nil
 
 	case linux.SO_SNDBUF:
 		if outLen < sizeOfInt32 {
@@ -1115,25 +1012,16 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress()))
+		return &v, nil
 
 	case linux.SO_REUSEPORT:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.ReusePortOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort()))
+		return &v, nil
 
 	case linux.SO_BINDTODEVICE:
 		var v tcpip.BindToDeviceOption
@@ -1174,13 +1062,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive()))
+		return &v, nil
 
 	case linux.SO_LINGER:
 		if outLen < linux.SizeOfLinger {
@@ -1235,21 +1118,18 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.NoChecksumOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum()))
+		return &v, nil
 
 	case linux.SO_ACCEPTCONN:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.AcceptConnOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
+		// This option is only viable for TCP endpoints.
+		var v bool
+		if _, skType, skProto := s.Type(); isTCPSocket(skType, skProto) {
+			v = tcp.EndpointState(ep.State()) == tcp.StateListen
 		}
 		vP := primitive.Int32(boolToInt32(v))
 		return &vP, nil
@@ -1261,46 +1141,36 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 }
 
 // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
-func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
+func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
+	if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) {
+		log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto)
+		return nil, syserr.ErrUnknownProtocolOption
+	}
+
 	switch name {
 	case linux.TCP_NODELAY:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.DelayOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(!v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption()))
+		return &v, nil
 
 	case linux.TCP_CORK:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.CorkOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption()))
+		return &v, nil
 
 	case linux.TCP_QUICKACK:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.QuickAckOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck()))
+		return &v, nil
 
 	case linux.TCP_MAXSEG:
 		if outLen < sizeOfInt32 {
@@ -1474,19 +1344,24 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal
 
 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
 func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
+	if _, ok := ep.(tcpip.Endpoint); !ok {
+		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
+		return nil, syserr.ErrUnknownProtocolOption
+	}
+
+	family, skType, _ := s.Type()
+	if family != linux.AF_INET6 {
+		return nil, syserr.ErrUnknownProtocolOption
+	}
+
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.V6OnlyOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only()))
+		return &v, nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1518,13 +1393,8 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass()))
+		return &v, nil
 
 	case linux.IP6T_ORIGINAL_DST:
 		if outLen < int(binary.Size(linux.SockAddrInet6{})) {
@@ -1536,7 +1406,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
+		a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
 		return a.(*linux.SockAddrInet6), nil
 
 	case linux.IP6T_SO_GET_INFO:
@@ -1545,7 +1415,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 		}
 
 		// Only valid for raw IPv6 sockets.
-		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+		if skType != linux.SOCK_RAW {
 			return nil, syserr.ErrProtocolNotAvailable
 		}
 
@@ -1565,7 +1435,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 			return nil, syserr.ErrInvalidArgument
 		}
 		// Only valid for raw IPv6 sockets.
-		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+		if skType != linux.SOCK_RAW {
 			return nil, syserr.ErrProtocolNotAvailable
 		}
 
@@ -1585,7 +1455,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 		}
 
 		// Only valid for raw IPv6 sockets.
-		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+		if skType != linux.SOCK_RAW {
 			return nil, syserr.ErrProtocolNotAvailable
 		}
 
@@ -1607,6 +1477,11 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 
 // getSockOptIP implements GetSockOpt when level is SOL_IP.
 func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
+	if _, ok := ep.(tcpip.Endpoint); !ok {
+		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
+		return nil, syserr.ErrUnknownProtocolOption
+	}
+
 	switch name {
 	case linux.IP_TTL:
 		if outLen < sizeOfInt32 {
@@ -1649,7 +1524,7 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
+		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
 
 		return &a.(*linux.SockAddrInet).Addr, nil
 
@@ -1658,13 +1533,8 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop()))
+		return &v, nil
 
 	case linux.IP_TOS:
 		// Length handling for parity with Linux.
@@ -1688,26 +1558,24 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS()))
+		return &v, nil
 
 	case linux.IP_PKTINFO:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption)
-		if err != nil {
-			return nil, syserr.TranslateNetstackError(err)
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo()))
+		return &v, nil
+
+	case linux.IP_HDRINCL:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
 		}
 
-		vP := primitive.Int32(boolToInt32(v))
-		return &vP, nil
+		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded()))
+		return &v, nil
 
 	case linux.SO_ORIGINAL_DST:
 		if outLen < int(binary.Size(linux.SockAddrInet{})) {
@@ -1719,7 +1587,7 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
-		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
+		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
 		return a.(*linux.SockAddrInet), nil
 
 	case linux.IPT_SO_GET_INFO:
@@ -1826,7 +1694,7 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
 		return setSockOptSocket(t, s, ep, name, optVal)
 
 	case linux.SOL_TCP:
-		return setSockOptTCP(t, ep, name, optVal)
+		return setSockOptTCP(t, s, ep, name, optVal)
 
 	case linux.SOL_IPV6:
 		return setSockOptIPv6(t, s, ep, name, optVal)
@@ -1876,7 +1744,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0))
+		ep.SocketOptions().SetReuseAddress(v != 0)
+		return nil
 
 	case linux.SO_REUSEPORT:
 		if len(optVal) < sizeOfInt32 {
@@ -1884,7 +1753,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0))
+		ep.SocketOptions().SetReusePort(v != 0)
+		return nil
 
 	case linux.SO_BINDTODEVICE:
 		n := bytes.IndexByte(optVal, 0)
@@ -1923,7 +1793,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0))
+		ep.SocketOptions().SetPassCred(v != 0)
+		return nil
 
 	case linux.SO_KEEPALIVE:
 		if len(optVal) < sizeOfInt32 {
@@ -1931,7 +1802,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0))
+		ep.SocketOptions().SetKeepAlive(v != 0)
+		return nil
 
 	case linux.SO_SNDTIMEO:
 		if len(optVal) < linux.SizeOfTimeval {
@@ -1979,7 +1851,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0))
+		ep.SocketOptions().SetNoChecksum(v != 0)
+		return nil
 
 	case linux.SO_LINGER:
 		if len(optVal) < linux.SizeOfLinger {
@@ -2011,7 +1884,12 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 }
 
 // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
-func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+	if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) {
+		log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto)
+		return syserr.ErrUnknownProtocolOption
+	}
+
 	switch name {
 	case linux.TCP_NODELAY:
 		if len(optVal) < sizeOfInt32 {
@@ -2019,7 +1897,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0))
+		ep.SocketOptions().SetDelayOption(v == 0)
+		return nil
 
 	case linux.TCP_CORK:
 		if len(optVal) < sizeOfInt32 {
@@ -2027,7 +1906,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0))
+		ep.SocketOptions().SetCorkOption(v != 0)
+		return nil
 
 	case linux.TCP_QUICKACK:
 		if len(optVal) < sizeOfInt32 {
@@ -2035,7 +1915,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0))
+		ep.SocketOptions().SetQuickAck(v != 0)
+		return nil
 
 	case linux.TCP_MAXSEG:
 		if len(optVal) < sizeOfInt32 {
@@ -2147,14 +2028,31 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 
 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
 func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+	if _, ok := ep.(tcpip.Endpoint); !ok {
+		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
+		return syserr.ErrUnknownProtocolOption
+	}
+
+	family, skType, skProto := s.Type()
+	if family != linux.AF_INET6 {
+		return syserr.ErrUnknownProtocolOption
+	}
+
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if len(optVal) < sizeOfInt32 {
 			return syserr.ErrInvalidArgument
 		}
 
+		if isTCPSocket(skType, skProto) && tcp.EndpointState(ep.State()) != tcp.StateInitial {
+			return syserr.ErrInvalidEndpointState
+		} else if isUDPSocket(skType, skProto) && udp.EndpointState(ep.State()) != udp.StateInitial {
+			return syserr.ErrInvalidEndpointState
+		}
+
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.V6OnlyOption, v != 0))
+		ep.SocketOptions().SetV6Only(v != 0)
+		return nil
 
 	case linux.IPV6_ADD_MEMBERSHIP,
 		linux.IPV6_DROP_MEMBERSHIP,
@@ -2193,7 +2091,8 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+		ep.SocketOptions().SetReceiveTClass(v != 0)
+		return nil
 
 	case linux.IP6T_SO_SET_REPLACE:
 		if len(optVal) < linux.SizeOfIP6TReplace {
@@ -2201,7 +2100,7 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
 		}
 
 		// Only valid for raw IPv6 sockets.
-		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+		if skType != linux.SOCK_RAW {
 			return syserr.ErrProtocolNotAvailable
 		}
 
@@ -2276,6 +2175,11 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
 
 // setSockOptIP implements SetSockOpt when level is SOL_IP.
 func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+	if _, ok := ep.(tcpip.Endpoint); !ok {
+		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
+		return syserr.ErrUnknownProtocolOption
+	}
+
 	switch name {
 	case linux.IP_MULTICAST_TTL:
 		v, err := parseIntOrChar(optVal)
@@ -2328,7 +2232,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 
 		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
 			NIC:           tcpip.NICID(req.InterfaceIndex),
-			InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]),
+			InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]),
 		}))
 
 	case linux.IP_MULTICAST_LOOP:
@@ -2337,7 +2241,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0))
+		ep.SocketOptions().SetMulticastLoop(v != 0)
+		return nil
 
 	case linux.MCAST_JOIN_GROUP:
 		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
@@ -2373,7 +2278,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 		if err != nil {
 			return err
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
+		ep.SocketOptions().SetReceiveTOS(v != 0)
+		return nil
 
 	case linux.IP_PKTINFO:
 		if len(optVal) == 0 {
@@ -2383,7 +2289,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 		if err != nil {
 			return err
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+		ep.SocketOptions().SetReceivePacketInfo(v != 0)
+		return nil
 
 	case linux.IP_HDRINCL:
 		if len(optVal) == 0 {
@@ -2393,7 +2300,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
 		if err != nil {
 			return err
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
+		ep.SocketOptions().SetHeaderIncluded(v != 0)
+		return nil
 
 	case linux.IPT_SO_SET_REPLACE:
 		if len(optVal) < linux.SizeOfIPTReplace {
@@ -2535,7 +2443,6 @@ func emitUnimplementedEventIP(t *kernel.Task, name int) {
 	switch name {
 	case linux.IP_TOS,
 		linux.IP_TTL,
-		linux.IP_HDRINCL,
 		linux.IP_OPTIONS,
 		linux.IP_ROUTER_ALERT,
 		linux.IP_RECVOPTS,
@@ -2582,72 +2489,6 @@ func emitUnimplementedEventIP(t *kernel.Task, name int) {
 	}
 }
 
-// isLinkLocal determines if the given IPv6 address is link-local. This is the
-// case when it has the fe80::/10 prefix. This check is used to determine when
-// the NICID is relevant for a given IPv6 address.
-func isLinkLocal(addr tcpip.Address) bool {
-	return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80
-}
-
-// ConvertAddress converts the given address to a native format.
-func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) {
-	switch family {
-	case linux.AF_UNIX:
-		var out linux.SockAddrUnix
-		out.Family = linux.AF_UNIX
-		l := len([]byte(addr.Addr))
-		for i := 0; i < l; i++ {
-			out.Path[i] = int8(addr.Addr[i])
-		}
-
-		// Linux returns the used length of the address struct (including the
-		// null terminator) for filesystem paths. The Family field is 2 bytes.
-		// It is sometimes allowed to exclude the null terminator if the
-		// address length is the max. Abstract and empty paths always return
-		// the full exact length.
-		if l == 0 || out.Path[0] == 0 || l == len(out.Path) {
-			return &out, uint32(2 + l)
-		}
-		return &out, uint32(3 + l)
-
-	case linux.AF_INET:
-		var out linux.SockAddrInet
-		copy(out.Addr[:], addr.Addr)
-		out.Family = linux.AF_INET
-		out.Port = htons(addr.Port)
-		return &out, uint32(sockAddrInetSize)
-
-	case linux.AF_INET6:
-		var out linux.SockAddrInet6
-		if len(addr.Addr) == header.IPv4AddressSize {
-			// Copy address in v4-mapped format.
-			copy(out.Addr[12:], addr.Addr)
-			out.Addr[10] = 0xff
-			out.Addr[11] = 0xff
-		} else {
-			copy(out.Addr[:], addr.Addr)
-		}
-		out.Family = linux.AF_INET6
-		out.Port = htons(addr.Port)
-		if isLinkLocal(addr.Addr) {
-			out.Scope_id = uint32(addr.NIC)
-		}
-		return &out, uint32(sockAddrInet6Size)
-
-	case linux.AF_PACKET:
-		// TODO(gvisor.dev/issue/173): Return protocol too.
-		var out linux.SockAddrLink
-		out.Family = linux.AF_PACKET
-		out.InterfaceIndex = int32(addr.NIC)
-		out.HardwareAddrLen = header.EthernetAddressSize
-		copy(out.HardwareAddr[:], addr.Addr)
-		return &out, uint32(sockAddrLinkSize)
-
-	default:
-		return nil, 0
-	}
-}
-
 // GetSockName implements the linux syscall getsockname(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
@@ -2656,7 +2497,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *
 		return nil, 0, syserr.TranslateNetstackError(err)
 	}
 
-	a, l := ConvertAddress(s.family, addr)
+	a, l := socket.ConvertAddress(s.family, addr)
 	return a, l, nil
 }
 
@@ -2668,7 +2509,7 @@ func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *
 		return nil, 0, syserr.TranslateNetstackError(err)
 	}
 
-	a, l := ConvertAddress(s.family, addr)
+	a, l := socket.ConvertAddress(s.family, addr)
 	return a, l, nil
 }
 
@@ -2686,7 +2527,7 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ
 		// Always do at least one fetchReadView, even if the number of bytes to
 		// read is 0.
 		err = s.fetchReadView()
-		if err != nil {
+		if err != nil || len(s.readView) == 0 {
 			break
 		}
 		if dst.NumBytes() == 0 {
@@ -2709,15 +2550,20 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ
 		}
 		copied += n
 		s.readView.TrimFront(n)
-		if len(s.readView) == 0 {
-			atomic.StoreUint32(&s.readViewHasData, 0)
-		}
 
 		dst = dst.DropFirst(n)
 		if e != nil {
 			err = syserr.FromError(e)
 			break
 		}
+		// If we are done reading requested data then stop.
+		if dst.NumBytes() == 0 {
+			break
+		}
+	}
+
+	if len(s.readView) == 0 {
+		atomic.StoreUint32(&s.readViewHasData, 0)
 	}
 
 	// If we managed to copy something, we must deliver it.
@@ -2812,10 +2658,10 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq
 	var addr linux.SockAddr
 	var addrLen uint32
 	if isPacket && senderRequested {
-		addr, addrLen = ConvertAddress(s.family, s.sender)
+		addr, addrLen = socket.ConvertAddress(s.family, s.sender)
 		switch v := addr.(type) {
 		case *linux.SockAddrLink:
-			v.Protocol = htons(uint16(s.linkPacketInfo.Protocol))
+			v.Protocol = socket.Htons(uint16(s.linkPacketInfo.Protocol))
 			v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType)
 		}
 	}
@@ -2980,7 +2826,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b
 
 	var addr *tcpip.FullAddress
 	if len(to) > 0 {
-		addrBuf, family, err := AddressAndFamily(to)
+		addrBuf, family, err := socket.AddressAndFamily(to)
 		if err != nil {
 			return 0, err
 		}
@@ -3399,6 +3245,18 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
 	return rv
 }
 
+func isTCPSocket(skType linux.SockType, skProto int) bool {
+	return skType == linux.SOCK_STREAM && (skProto == 0 || skProto == syscall.IPPROTO_TCP)
+}
+
+func isUDPSocket(skType linux.SockType, skProto int) bool {
+	return skType == linux.SOCK_DGRAM && (skProto == 0 || skProto == syscall.IPPROTO_UDP)
+}
+
+func isICMPSocket(skType linux.SockType, skProto int) bool {
+	return skType == linux.SOCK_DGRAM && (skProto == syscall.IPPROTO_ICMP || skProto == syscall.IPPROTO_ICMPV6)
+}
+
 // State implements socket.Socket.State. State translates the internal state
 // returned by netstack to values defined by Linux.
 func (s *socketOpsCommon) State() uint32 {
@@ -3408,7 +3266,7 @@ func (s *socketOpsCommon) State() uint32 {
 	}
 
 	switch {
-	case s.skType == linux.SOCK_STREAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_TCP:
+	case isTCPSocket(s.skType, s.protocol):
 		// TCP socket.
 		switch tcp.EndpointState(s.Endpoint.State()) {
 		case tcp.StateEstablished:
@@ -3437,7 +3295,7 @@ func (s *socketOpsCommon) State() uint32 {
 			// Internal or unknown state.
 			return 0
 		}
-	case s.skType == linux.SOCK_DGRAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_UDP:
+	case isUDPSocket(s.skType, s.protocol):
 		// UDP socket.
 		switch udp.EndpointState(s.Endpoint.State()) {
 		case udp.StateInitial, udp.StateBound, udp.StateClosed:
@@ -3447,7 +3305,7 @@ func (s *socketOpsCommon) State() uint32 {
 		default:
 			return 0
 		}
-	case s.skType == linux.SOCK_DGRAM && s.protocol == syscall.IPPROTO_ICMP || s.protocol == syscall.IPPROTO_ICMPV6:
+	case isICMPSocket(s.skType, s.protocol):
 		// TODO(b/112063468): Export states for ICMP sockets.
 	case s.skType == linux.SOCK_RAW:
 		// TODO(b/112063468): Export states for raw sockets.
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index b0d9e4d9e..b756bfca0 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -51,9 +51,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{})
 // NewVFS2 creates a new endpoint socket.
 func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
 	if skType == linux.SOCK_STREAM {
-		if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
+		endpoint.SocketOptions().SetDelayOption(true)
 	}
 
 	mnt := t.Kernel().SocketMount()
@@ -191,7 +189,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 	var addrLen uint32
 	if peerAddr != nil {
 		// Get address of the peer and write it to peer slice.
-		addr, addrLen = ConvertAddress(s.family, *peerAddr)
+		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
 	}
 
 	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index ead3b2b79..c847ff1c7 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -158,7 +158,7 @@ func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol
 
 	// protocol is passed in network byte order, but netstack wants it in
 	// host order.
-	netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol)))
+	netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol)))
 
 	wq := &waiter.Queue{}
 	ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq)
diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go
index 2a01143f6..0af805246 100644
--- a/pkg/sentry/socket/netstack/provider_vfs2.go
+++ b/pkg/sentry/socket/netstack/provider_vfs2.go
@@ -102,7 +102,7 @@ func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, prot
 
 	// protocol is passed in network byte order, but netstack wants it in
 	// host order.
-	netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol)))
+	netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol)))
 
 	wq := &waiter.Queue{}
 	ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq)
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index fa9ac9059..cc0fadeb5 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -324,12 +324,12 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			0,                               // Support Ip/FragCreates.
 		}
 	case *inet.StatSNMPICMP:
-		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
-		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		in := Metrics.ICMP.V4.PacketsReceived.ICMPv4PacketStats
+		out := Metrics.ICMP.V4.PacketsSent.ICMPv4PacketStats
 		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPICMP{
 			0, // Icmp/InMsgs.
-			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
+			Metrics.ICMP.V4.PacketsSent.Dropped.Value(), // InErrors.
 			0,                         // Icmp/InCsumErrors.
 			in.DstUnreachable.Value(), // InDestUnreachs.
 			in.TimeExceeded.Value(),   // InTimeExcds.
@@ -343,18 +343,18 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			in.InfoRequest.Value(),    // InAddrMasks.
 			in.InfoReply.Value(),      // InAddrMaskReps.
 			0,                         // Icmp/OutMsgs.
-			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
-			out.DstUnreachable.Value(),                     // OutDestUnreachs.
-			out.TimeExceeded.Value(),                       // OutTimeExcds.
-			out.ParamProblem.Value(),                       // OutParmProbs.
-			out.SrcQuench.Value(),                          // OutSrcQuenchs.
-			out.Redirect.Value(),                           // OutRedirects.
-			out.Echo.Value(),                               // OutEchos.
-			out.EchoReply.Value(),                          // OutEchoReps.
-			out.Timestamp.Value(),                          // OutTimestamps.
-			out.TimestampReply.Value(),                     // OutTimestampReps.
-			out.InfoRequest.Value(),                        // OutAddrMasks.
-			out.InfoReply.Value(),                          // OutAddrMaskReps.
+			Metrics.ICMP.V4.PacketsReceived.Invalid.Value(), // OutErrors.
+			out.DstUnreachable.Value(),                      // OutDestUnreachs.
+			out.TimeExceeded.Value(),                        // OutTimeExcds.
+			out.ParamProblem.Value(),                        // OutParmProbs.
+			out.SrcQuench.Value(),                           // OutSrcQuenchs.
+			out.Redirect.Value(),                            // OutRedirects.
+			out.Echo.Value(),                                // OutEchos.
+			out.EchoReply.Value(),                           // OutEchoReps.
+			out.Timestamp.Value(),                           // OutTimestamps.
+			out.TimestampReply.Value(),                      // OutTimestampReps.
+			out.InfoRequest.Value(),                         // OutAddrMasks.
+			out.InfoReply.Value(),                           // OutAddrMaskReps.
 		}
 	case *inet.StatSNMPTCP:
 		tcp := Metrics.TCP
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index fd31479e5..9049e8a21 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -18,6 +18,7 @@
 package socket
 
 import (
+	"bytes"
 	"fmt"
 	"sync/atomic"
 	"syscall"
@@ -35,6 +36,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -460,3 +462,176 @@ func UnmarshalSockAddr(family int, data []byte) linux.SockAddr {
 		panic(fmt.Sprintf("Unsupported socket family %v", family))
 	}
 }
+
+var sockAddrLinkSize = (&linux.SockAddrLink{}).SizeBytes()
+var sockAddrInetSize = (&linux.SockAddrInet{}).SizeBytes()
+var sockAddrInet6Size = (&linux.SockAddrInet6{}).SizeBytes()
+
+// Ntohs converts a 16-bit number from network byte order to host byte order. It
+// assumes that the host is little endian.
+func Ntohs(v uint16) uint16 {
+	return v<<8 | v>>8
+}
+
+// Htons converts a 16-bit number from host byte order to network byte order. It
+// assumes that the host is little endian.
+func Htons(v uint16) uint16 {
+	return Ntohs(v)
+}
+
+// isLinkLocal determines if the given IPv6 address is link-local. This is the
+// case when it has the fe80::/10 prefix. This check is used to determine when
+// the NICID is relevant for a given IPv6 address.
+func isLinkLocal(addr tcpip.Address) bool {
+	return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80
+}
+
+// ConvertAddress converts the given address to a native format.
+func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) {
+	switch family {
+	case linux.AF_UNIX:
+		var out linux.SockAddrUnix
+		out.Family = linux.AF_UNIX
+		l := len([]byte(addr.Addr))
+		for i := 0; i < l; i++ {
+			out.Path[i] = int8(addr.Addr[i])
+		}
+
+		// Linux returns the used length of the address struct (including the
+		// null terminator) for filesystem paths. The Family field is 2 bytes.
+		// It is sometimes allowed to exclude the null terminator if the
+		// address length is the max. Abstract and empty paths always return
+		// the full exact length.
+		if l == 0 || out.Path[0] == 0 || l == len(out.Path) {
+			return &out, uint32(2 + l)
+		}
+		return &out, uint32(3 + l)
+
+	case linux.AF_INET:
+		var out linux.SockAddrInet
+		copy(out.Addr[:], addr.Addr)
+		out.Family = linux.AF_INET
+		out.Port = Htons(addr.Port)
+		return &out, uint32(sockAddrInetSize)
+
+	case linux.AF_INET6:
+		var out linux.SockAddrInet6
+		if len(addr.Addr) == header.IPv4AddressSize {
+			// Copy address in v4-mapped format.
+			copy(out.Addr[12:], addr.Addr)
+			out.Addr[10] = 0xff
+			out.Addr[11] = 0xff
+		} else {
+			copy(out.Addr[:], addr.Addr)
+		}
+		out.Family = linux.AF_INET6
+		out.Port = Htons(addr.Port)
+		if isLinkLocal(addr.Addr) {
+			out.Scope_id = uint32(addr.NIC)
+		}
+		return &out, uint32(sockAddrInet6Size)
+
+	case linux.AF_PACKET:
+		// TODO(gvisor.dev/issue/173): Return protocol too.
+		var out linux.SockAddrLink
+		out.Family = linux.AF_PACKET
+		out.InterfaceIndex = int32(addr.NIC)
+		out.HardwareAddrLen = header.EthernetAddressSize
+		copy(out.HardwareAddr[:], addr.Addr)
+		return &out, uint32(sockAddrLinkSize)
+
+	default:
+		return nil, 0
+	}
+}
+
+// BytesToIPAddress converts an IPv4 or IPv6 address from the user to the
+// netstack representation taking any addresses into account.
+func BytesToIPAddress(addr []byte) tcpip.Address {
+	if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) {
+		return ""
+	}
+	return tcpip.Address(addr)
+}
+
+// AddressAndFamily reads an sockaddr struct from the given address and
+// converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
+// AF_INET6, and AF_PACKET addresses.
+//
+// AddressAndFamily returns an address and its family.
+func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
+	// Make sure we have at least 2 bytes for the address family.
+	if len(addr) < 2 {
+		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
+	}
+
+	// Get the rest of the fields based on the address family.
+	switch family := usermem.ByteOrder.Uint16(addr); family {
+	case linux.AF_UNIX:
+		path := addr[2:]
+		if len(path) > linux.UnixPathMax {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+		// Drop the terminating NUL (if one exists) and everything after
+		// it for filesystem (non-abstract) addresses.
+		if len(path) > 0 && path[0] != 0 {
+			if n := bytes.IndexByte(path[1:], 0); n >= 0 {
+				path = path[:n+1]
+			}
+		}
+		return tcpip.FullAddress{
+			Addr: tcpip.Address(path),
+		}, family, nil
+
+	case linux.AF_INET:
+		var a linux.SockAddrInet
+		if len(addr) < sockAddrInetSize {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a)
+
+		out := tcpip.FullAddress{
+			Addr: BytesToIPAddress(a.Addr[:]),
+			Port: Ntohs(a.Port),
+		}
+		return out, family, nil
+
+	case linux.AF_INET6:
+		var a linux.SockAddrInet6
+		if len(addr) < sockAddrInet6Size {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a)
+
+		out := tcpip.FullAddress{
+			Addr: BytesToIPAddress(a.Addr[:]),
+			Port: Ntohs(a.Port),
+		}
+		if isLinkLocal(out.Addr) {
+			out.NIC = tcpip.NICID(a.Scope_id)
+		}
+		return out, family, nil
+
+	case linux.AF_PACKET:
+		var a linux.SockAddrLink
+		if len(addr) < sockAddrLinkSize {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+		if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize {
+			return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/173): Return protocol too.
+		return tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}, family, nil
+
+	case linux.AF_UNSPEC:
+		return tcpip.FullAddress{}, family, nil
+
+	default:
+		return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported
+	}
+}
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 6d9e502bd..9f7aca305 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -118,28 +118,24 @@ var (
 
 // NewConnectioned creates a new unbound connectionedEndpoint.
 func NewConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) Endpoint {
-	return &connectionedEndpoint{
+	return newConnectioned(ctx, stype, uid)
+}
+
+func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) *connectionedEndpoint {
+	ep := &connectionedEndpoint{
 		baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
 		id:           uid.UniqueID(),
 		idGenerator:  uid,
 		stype:        stype,
 	}
+	ep.ops.InitHandler(ep)
+	return ep
 }
 
 // NewPair allocates a new pair of connected unix-domain connectionedEndpoints.
 func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
-	a := &connectionedEndpoint{
-		baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
-		id:           uid.UniqueID(),
-		idGenerator:  uid,
-		stype:        stype,
-	}
-	b := &connectionedEndpoint{
-		baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
-		id:           uid.UniqueID(),
-		idGenerator:  uid,
-		stype:        stype,
-	}
+	a := newConnectioned(ctx, stype, uid)
+	b := newConnectioned(ctx, stype, uid)
 
 	q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
 	q1.InitRefs()
@@ -171,12 +167,14 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E
 // NewExternal creates a new externally backed Endpoint. It behaves like a
 // socketpair.
 func NewExternal(ctx context.Context, stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint {
-	return &connectionedEndpoint{
+	ep := &connectionedEndpoint{
 		baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected},
 		id:           uid.UniqueID(),
 		idGenerator:  uid,
 		stype:        stype,
 	}
+	ep.ops.InitHandler(ep)
+	return ep
 }
 
 // ID implements ConnectingEndpoint.ID.
@@ -298,6 +296,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
 		idGenerator: e.idGenerator,
 		stype:       e.stype,
 	}
+	ne.ops.InitHandler(ne)
 
 	readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit}
 	readQueue.InitRefs()
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 1406971bc..0813ad87d 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -44,6 +44,7 @@ func NewConnectionless(ctx context.Context) Endpoint {
 	q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}
 	q.InitRefs()
 	ep.receiver = &queueReceiver{readQueue: &q}
+	ep.ops.InitHandler(ep)
 	return ep
 }
 
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 18a50e9f8..0247e93fa 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -16,8 +16,6 @@
 package transport
 
 import (
-	"sync/atomic"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
@@ -180,10 +178,6 @@ type Endpoint interface {
 	// SetSockOpt sets a socket option.
 	SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error
 
-	// SetSockOptBool sets a socket option for simple cases when a value has
-	// the int type.
-	SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
-
 	// SetSockOptInt sets a socket option for simple cases when a value has
 	// the int type.
 	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
@@ -191,10 +185,6 @@ type Endpoint interface {
 	// GetSockOpt gets a socket option.
 	GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error
 
-	// GetSockOptBool gets a socket option for simple cases when a return
-	// value has the int type.
-	GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
-
 	// GetSockOptInt gets a socket option for simple cases when a return
 	// value has the int type.
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
@@ -203,10 +193,11 @@ type Endpoint interface {
 	// procfs.
 	State() uint32
 
-	// LastError implements tcpip.Endpoint.LastError.
+	// LastError clears and returns the last error reported by the endpoint.
 	LastError() *tcpip.Error
 
-	// SocketOptions implements tcpip.Endpoint.SocketOptions.
+	// SocketOptions returns the structure which contains all the socket
+	// level options.
 	SocketOptions() *tcpip.SocketOptions
 }
 
@@ -739,10 +730,7 @@ func (e *connectedEndpoint) CloseUnread() {
 // +stateify savable
 type baseEndpoint struct {
 	*waiter.Queue
-
-	// passcred specifies whether SCM_CREDENTIALS socket control messages are
-	// enabled on this endpoint. Must be accessed atomically.
-	passcred int32
+	tcpip.DefaultSocketOptionsHandler
 
 	// Mutex protects the below fields.
 	sync.Mutex `state:"nosave"`
@@ -761,6 +749,7 @@ type baseEndpoint struct {
 	// linger is used for SO_LINGER socket option.
 	linger tcpip.LingerOption
 
+	// ops is used to get socket level options.
 	ops tcpip.SocketOptions
 }
 
@@ -786,7 +775,7 @@ func (e *baseEndpoint) EventUnregister(we *waiter.Entry) {
 
 // Passcred implements Credentialer.Passcred.
 func (e *baseEndpoint) Passcred() bool {
-	return atomic.LoadInt32(&e.passcred) != 0
+	return e.SocketOptions().GetPassCred()
 }
 
 // ConnectedPasscred implements Credentialer.ConnectedPasscred.
@@ -796,14 +785,6 @@ func (e *baseEndpoint) ConnectedPasscred() bool {
 	return e.connected != nil && e.connected.Passcred()
 }
 
-func (e *baseEndpoint) setPasscred(pc bool) {
-	if pc {
-		atomic.StoreInt32(&e.passcred, 1)
-	} else {
-		atomic.StoreInt32(&e.passcred, 0)
-	}
-}
-
 // Connected implements ConnectingEndpoint.Connected.
 func (e *baseEndpoint) Connected() bool {
 	return e.receiver != nil && e.connected != nil
@@ -868,17 +849,6 @@ func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	return nil
 }
 
-func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-	case tcpip.PasscredOption:
-		e.setPasscred(v)
-	case tcpip.ReuseAddressOption:
-	default:
-		log.Warningf("Unsupported socket option: %d", opt)
-	}
-	return nil
-}
-
 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
 	case tcpip.SendBufferSizeOption:
@@ -889,20 +859,6 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
-func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
-		return false, nil
-
-	case tcpip.PasscredOption:
-		return e.Passcred(), nil
-
-	default:
-		log.Warningf("Unsupported socket option: %d", opt)
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
 	case tcpip.ReceiveQueueSizeOption:
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 3e520d2ee..c59297c80 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -115,9 +115,6 @@ type socketOpsCommon struct {
 	// bound, they cannot be modified.
 	abstractName      string
 	abstractNamespace *kernel.AbstractSocketNamespace
-
-	// ops is used to get socket level options.
-	ops tcpip.SocketOptions
 }
 
 func (s *socketOpsCommon) isPacket() bool {
@@ -139,7 +136,7 @@ func (s *socketOpsCommon) Endpoint() transport.Endpoint {
 
 // extractPath extracts and validates the address.
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
-	addr, family, err := netstack.AddressAndFamily(sockaddr)
+	addr, family, err := socket.AddressAndFamily(sockaddr)
 	if err != nil {
 		if err == syserr.ErrAddressFamilyNotSupported {
 			err = syserr.ErrInvalidArgument
@@ -172,7 +169,7 @@ func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *
 		return nil, 0, syserr.TranslateNetstackError(err)
 	}
 
-	a, l := netstack.ConvertAddress(linux.AF_UNIX, addr)
+	a, l := socket.ConvertAddress(linux.AF_UNIX, addr)
 	return a, l, nil
 }
 
@@ -184,7 +181,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *
 		return nil, 0, syserr.TranslateNetstackError(err)
 	}
 
-	a, l := netstack.ConvertAddress(linux.AF_UNIX, addr)
+	a, l := socket.ConvertAddress(linux.AF_UNIX, addr)
 	return a, l, nil
 }
 
@@ -258,7 +255,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 	var addr linux.SockAddr
 	var addrLen uint32
 	if peerAddr != nil {
-		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
+		addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr)
 	}
 
 	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -650,7 +647,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		var from linux.SockAddr
 		var fromLen uint32
 		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
-			from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From)
+			from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From)
 		}
 
 		if r.ControlTrunc {
@@ -685,7 +682,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			var from linux.SockAddr
 			var fromLen uint32
 			if r.From != nil {
-				from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From)
+				from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From)
 			}
 
 			if r.ControlTrunc {
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index eaf0b0d26..27f705bb2 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -172,7 +172,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 	var addr linux.SockAddr
 	var addrLen uint32
 	if peerAddr != nil {
-		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
+		addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr)
 	}
 
 	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index a920180d3..d36a64ffc 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -32,8 +32,8 @@ go_library(
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/socket",
         "//pkg/sentry/socket/netlink",
-        "//pkg/sentry/socket/netstack",
         "//pkg/sentry/syscalls/linux",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index cc5f70cd4..d943a7cb1 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
-	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -341,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
 
 	switch family {
 	case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
-		fa, _, err := netstack.AddressAndFamily(b)
+		fa, _, err := socket.AddressAndFamily(b)
 		if err != nil {
 			return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
 		}
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index bb1f715e2..cff442846 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -118,7 +118,7 @@ var AMD64 = &kernel.SyscallTable{
 		63:  syscalls.Supported("uname", Uname),
 		64:  syscalls.Supported("semget", Semget),
 		65:  syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
-		66:  syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil),
+		66:  syscalls.PartiallySupported("semctl", Semctl, "Options SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil),
 		67:  syscalls.Supported("shmdt", Shmdt),
 		68:  syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
 		69:  syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
@@ -619,7 +619,7 @@ var ARM64 = &kernel.SyscallTable{
 		188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
 		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
 		190: syscalls.Supported("semget", Semget),
-		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil),
+		191: syscalls.PartiallySupported("semctl", Semctl, "Options SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil),
 		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
 		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
 		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 519066a47..8db587401 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -646,7 +646,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
 			return 0, nil, err
 		}
-		fSetOwn(t, file, set)
+		fSetOwn(t, int(fd), file, set)
 		return 0, nil, nil
 
 	case linux.FIOGETOWN, linux.SIOCGPGRP:
@@ -901,8 +901,8 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 {
 //
 // If who is positive, it represents a PID. If negative, it represents a PGID.
 // If the PID or PGID is invalid, the owner is silently unset.
-func fSetOwn(t *kernel.Task, file *fs.File, who int32) error {
-	a := file.Async(fasync.New).(*fasync.FileAsync)
+func fSetOwn(t *kernel.Task, fd int, file *fs.File, who int32) error {
+	a := file.Async(fasync.New(fd)).(*fasync.FileAsync)
 	if who < 0 {
 		// Check for overflow before flipping the sign.
 		if who-1 > who {
@@ -1049,7 +1049,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_GETOWN:
 		return uintptr(fGetOwn(t, file)), nil, nil
 	case linux.F_SETOWN:
-		return 0, nil, fSetOwn(t, file, args[2].Int())
+		return 0, nil, fSetOwn(t, int(fd), file, args[2].Int())
 	case linux.F_GETOWN_EX:
 		addr := args[2].Pointer()
 		owner := fGetOwnEx(t, file)
@@ -1062,7 +1062,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if err != nil {
 			return 0, nil, err
 		}
-		a := file.Async(fasync.New).(*fasync.FileAsync)
+		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
 		switch owner.Type {
 		case linux.F_OWNER_TID:
 			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
@@ -1111,6 +1111,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		}
 		n, err := sz.SetFifoSize(int64(args[2].Int()))
 		return uintptr(n), nil, err
+	case linux.F_GETSIG:
+		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
+		return uintptr(a.Signal()), nil, nil
+	case linux.F_SETSIG:
+		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
+		return 0, nil, a.SetSignal(linux.Signal(args[2].Int()))
 	default:
 		// Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index e383a0a87..a1601676f 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -146,8 +146,15 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		v, err := getNCnt(t, id, num)
 		return uintptr(v), nil, err
 
-	case linux.IPC_INFO,
-		linux.SEM_INFO,
+	case linux.IPC_INFO:
+		buf := args[3].Pointer()
+		r := t.IPCNamespace().SemaphoreRegistry()
+		info := r.IPCInfo()
+		_, err := info.CopyOut(t, buf)
+		// TODO(gvisor.dev/issue/137): Return the index of the highest used entry.
+		return 0, nil, err
+
+	case linux.SEM_INFO,
 		linux.SEM_STAT,
 		linux.SEM_STAT_ANY:
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 36e89700e..7dd9ef857 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -165,7 +165,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			ownerType = linux.F_OWNER_PGRP
 			who = -who
 		}
-		return 0, nil, setAsyncOwner(t, file, ownerType, who)
+		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
 	case linux.F_GETOWN_EX:
 		owner, hasOwner := getAsyncOwner(t, file)
 		if !hasOwner {
@@ -179,7 +179,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if err != nil {
 			return 0, nil, err
 		}
-		return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
+		return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID)
 	case linux.F_SETPIPE_SZ:
 		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
 		if !ok {
@@ -207,6 +207,16 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, err
 	case linux.F_SETLK, linux.F_SETLKW:
 		return 0, nil, posixLock(t, args, file, cmd)
+	case linux.F_GETSIG:
+		a := file.AsyncHandler()
+		if a == nil {
+			// Default behavior aka SIGIO.
+			return 0, nil, nil
+		}
+		return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil
+	case linux.F_SETSIG:
+		a := file.SetAsyncHandler(fasync.NewVFS2(int(fd))).(*fasync.FileAsync)
+		return 0, nil, a.SetSignal(linux.Signal(args[2].Int()))
 	default:
 		// Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
@@ -241,7 +251,7 @@ func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwne
 	}
 }
 
-func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error {
+func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error {
 	switch ownerType {
 	case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP:
 		// Acceptable type.
@@ -249,7 +259,7 @@ func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32
 		return syserror.EINVAL
 	}
 
-	a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync)
+	a := file.SetAsyncHandler(fasync.NewVFS2(fd)).(*fasync.FileAsync)
 	if pid == 0 {
 		a.ClearOwner()
 		return nil
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 2806c3f6f..20c264fef 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -100,7 +100,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			ownerType = linux.F_OWNER_PGRP
 			who = -who
 		}
-		return 0, nil, setAsyncOwner(t, file, ownerType, who)
+		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
 	}
 
 	ret, err := file.Ioctl(t, t.MemoryManager(), args)
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 9ce4f280a..8bb763a47 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -343,8 +343,8 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	// Copy data.
 	var (
-		n   int64
-		err error
+		total int64
+		err   error
 	)
 	dw := dualWaiter{
 		inFile:  inFile,
@@ -357,13 +357,20 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	// can block.
 	nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
 	if outIsPipe {
-		for n < count {
-			var spliceN int64
-			spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count)
+		for {
+			var n int64
+			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total)
 			if offset != -1 {
-				offset += spliceN
+				offset += n
+			}
+			total += n
+			if total == count {
+				break
+			}
+			if err == nil && t.Interrupted() {
+				err = syserror.ErrInterrupted
+				break
 			}
-			n += spliceN
 			if err == syserror.ErrWouldBlock && !nonBlock {
 				err = dw.waitForBoth(t)
 			}
@@ -374,7 +381,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	} else {
 		// Read inFile to buffer, then write the contents to outFile.
 		buf := make([]byte, count)
-		for n < count {
+		for {
 			var readN int64
 			if offset != -1 {
 				readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{})
@@ -382,7 +389,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			} else {
 				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
 			}
-			n += readN
 
 			// Write all of the bytes that we read. This may need
 			// multiple write calls to complete.
@@ -398,7 +404,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					// We didn't complete the write. Only report the bytes that were actually
 					// written, and rewind offsets as needed.
 					notWritten := int64(len(wbuf))
-					n -= notWritten
+					readN -= notWritten
 					if offset == -1 {
 						// We modified the offset of the input file itself during the read
 						// operation. Rewind it.
@@ -415,6 +421,16 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					break
 				}
 			}
+
+			total += readN
+			buf = buf[readN:]
+			if total == count {
+				break
+			}
+			if err == nil && t.Interrupted() {
+				err = syserror.ErrInterrupted
+				break
+			}
 			if err == syserror.ErrWouldBlock && !nonBlock {
 				err = dw.waitForBoth(t)
 			}
@@ -432,7 +448,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		}
 	}
 
-	if n != 0 {
+	if total != 0 {
 		inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
 		outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
 
@@ -445,7 +461,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
 	// This is used only for debugging purposes.
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
+	return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
 }
 
 // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 440c9307c..a3868bf16 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -105,6 +105,7 @@ go_library(
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsmetric",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index a98aac52b..072655fe8 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -204,8 +204,8 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
 	file.EventRegister(&epi.waiter, wmask)
 
 	// Check if the file is already ready.
-	if file.Readiness(wmask)&wmask != 0 {
-		epi.Callback(nil)
+	if m := file.Readiness(wmask) & wmask; m != 0 {
+		epi.Callback(nil, m)
 	}
 
 	// Add epi to file.epolls so that it is removed when the last
@@ -274,8 +274,8 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event
 	file.EventRegister(&epi.waiter, wmask)
 
 	// Check if the file is already ready with the new mask.
-	if file.Readiness(wmask)&wmask != 0 {
-		epi.Callback(nil)
+	if m := file.Readiness(wmask) & wmask; m != 0 {
+		epi.Callback(nil, m)
 	}
 
 	return nil
@@ -311,7 +311,7 @@ func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error
 }
 
 // Callback implements waiter.EntryCallback.Callback.
-func (epi *epollInterest) Callback(*waiter.Entry) {
+func (epi *epollInterest) Callback(*waiter.Entry, waiter.EventMask) {
 	newReady := false
 	epi.epoll.mu.Lock()
 	if !epi.ready {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 936f9fc71..5321ac80a 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -15,12 +15,14 @@
 package vfs
 
 import (
+	"io"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -42,7 +44,7 @@ import (
 type FileDescription struct {
 	FileDescriptionRefs
 
-	// flagsMu protects statusFlags and asyncHandler below.
+	// flagsMu protects `statusFlags`, `saved`, and `asyncHandler` below.
 	flagsMu sync.Mutex `state:"nosave"`
 
 	// statusFlags contains status flags, "initialized by open(2) and possibly
@@ -51,6 +53,11 @@ type FileDescription struct {
 	// access to asyncHandler.
 	statusFlags uint32
 
+	// saved is true after beforeSave is called. This is used to prevent
+	// double-unregistration of asyncHandler. This does not work properly for
+	// save-resume, which is not currently supported in gVisor (see b/26588733).
+	saved bool `state:"nosave"`
+
 	// asyncHandler handles O_ASYNC signal generation. It is set with the
 	// F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
 	// also be set by fcntl(2).
@@ -183,7 +190,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) {
 		}
 		fd.vd.DecRef(ctx)
 		fd.flagsMu.Lock()
-		if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+		if !fd.saved && fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
 			fd.asyncHandler.Unregister(fd)
 		}
 		fd.asyncHandler = nil
@@ -583,7 +590,11 @@ func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
 	if !fd.readable {
 		return 0, syserror.EBADF
 	}
-	return fd.impl.PRead(ctx, dst, offset, opts)
+	start := fsmetric.StartReadWait()
+	n, err := fd.impl.PRead(ctx, dst, offset, opts)
+	fsmetric.Reads.Increment()
+	fsmetric.FinishReadWait(fsmetric.ReadWait, start)
+	return n, err
 }
 
 // Read is similar to PRead, but does not specify an offset.
@@ -591,7 +602,11 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
 	if !fd.readable {
 		return 0, syserror.EBADF
 	}
-	return fd.impl.Read(ctx, dst, opts)
+	start := fsmetric.StartReadWait()
+	n, err := fd.impl.Read(ctx, dst, opts)
+	fsmetric.Reads.Increment()
+	fsmetric.FinishReadWait(fsmetric.ReadWait, start)
+	return n, err
 }
 
 // PWrite writes src to the file represented by fd, starting at the given
@@ -825,44 +840,27 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn
 	return fd.asyncHandler
 }
 
-// FileReadWriteSeeker is a helper struct to pass a FileDescription as
-// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
-type FileReadWriteSeeker struct {
-	FD    *FileDescription
-	Ctx   context.Context
-	ROpts ReadOptions
-	WOpts WriteOptions
-}
-
-// ReadAt implements io.ReaderAt.ReadAt.
-func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
-	dst := usermem.BytesIOSequence(p)
-	n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
-	return int(n), err
-}
-
-// Read implements io.ReadWriteSeeker.Read.
-func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
-	dst := usermem.BytesIOSequence(p)
-	n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
-	return int(n), err
-}
-
-// Seek implements io.ReadWriteSeeker.Seek.
-func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
-	return f.FD.Seek(f.Ctx, offset, int32(whence))
-}
-
-// WriteAt implements io.WriterAt.WriteAt.
-func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
-	dst := usermem.BytesIOSequence(p)
-	n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts)
-	return int(n), err
-}
-
-// Write implements io.ReadWriteSeeker.Write.
-func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
-	buf := usermem.BytesIOSequence(p)
-	n, err := f.FD.Write(f.Ctx, buf, f.WOpts)
-	return int(n), err
+// CopyRegularFileData copies data from srcFD to dstFD until reading from srcFD
+// returns EOF or an error. It returns the number of bytes copied.
+func CopyRegularFileData(ctx context.Context, dstFD, srcFD *FileDescription) (int64, error) {
+	done := int64(0)
+	buf := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
+	for {
+		readN, readErr := srcFD.Read(ctx, buf, ReadOptions{})
+		if readErr != nil && readErr != io.EOF {
+			return done, readErr
+		}
+		src := buf.TakeFirst64(readN)
+		for src.NumBytes() != 0 {
+			writeN, writeErr := dstFD.Write(ctx, src, WriteOptions{})
+			done += writeN
+			src = src.DropFirst64(writeN)
+			if writeErr != nil {
+				return done, writeErr
+			}
+		}
+		if readErr == io.EOF {
+			return done, nil
+		}
+	}
 }
diff --git a/pkg/sentry/vfs/save_restore.go b/pkg/sentry/vfs/save_restore.go
index 7723ed643..8f070ed53 100644
--- a/pkg/sentry/vfs/save_restore.go
+++ b/pkg/sentry/vfs/save_restore.go
@@ -18,8 +18,10 @@ import (
 	"fmt"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refsvfs2"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 // FilesystemImplSaveRestoreExtension is an optional extension to
@@ -120,5 +122,20 @@ func (mnt *Mount) afterLoad() {
 func (epi *epollInterest) afterLoad() {
 	// Mark all epollInterests as ready after restore so that the next call to
 	// EpollInstance.ReadEvents() rechecks their readiness.
-	epi.Callback(nil)
+	epi.Callback(nil, waiter.EventMaskFromLinux(epi.mask))
+}
+
+// beforeSave is called by stateify.
+func (fd *FileDescription) beforeSave() {
+	fd.saved = true
+	if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+		fd.asyncHandler.Unregister(fd)
+	}
+}
+
+// afterLoad is called by stateify.
+func (fd *FileDescription) afterLoad() {
+	if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+		fd.asyncHandler.Register(fd)
+	}
 }
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 48d6252f7..6fd1bb0b2 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -41,6 +41,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -381,6 +382,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 // OpenAt returns a FileDescription providing access to the file at the given
 // path. A reference is taken on the returned FileDescription.
 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+	fsmetric.Opens.Increment()
+
 	// Remove:
 	//
 	// - O_CLOEXEC, which affects file descriptors and therefore must be
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 1d1062aeb..8e3146d8d 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -338,6 +338,7 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 		tid := w.k.TaskSet().Root.IDOfTask(t)
 		buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime)))
 	}
+	buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)")
 
 	// Force stack dump only if a new task is detected.
 	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
diff --git a/pkg/shim/v2/service.go b/pkg/shim/v2/service.go
index 2e39d2c4a..cba403cae 100644
--- a/pkg/shim/v2/service.go
+++ b/pkg/shim/v2/service.go
@@ -67,9 +67,15 @@ var (
 
 var _ = (taskAPI.TaskService)(&service{})
 
-// configFile is the default config file name. For containerd 1.2,
-// we assume that a config.toml should exist in the runtime root.
-const configFile = "config.toml"
+const (
+	// configFile is the default config file name. For containerd 1.2,
+	// we assume that a config.toml should exist in the runtime root.
+	configFile = "config.toml"
+
+	// shimAddressPath is the relative path to a file that contains the address
+	// to the shim UDS. See service.shimAddress.
+	shimAddressPath = "address"
+)
 
 // New returns a new shim service that can be used via GRPC.
 func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) {
@@ -101,6 +107,11 @@ func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()
 		return nil, fmt.Errorf("failed to initialized platform behavior: %w", err)
 	}
 	go s.forward(ctx, publisher)
+
+	if address, err := shim.ReadAddress(shimAddressPath); err == nil {
+		s.shimAddress = address
+	}
+
 	return s, nil
 }
 
@@ -152,6 +163,9 @@ type service struct {
 	// cancel is a function that needs to be called before the shim stops. The
 	// function is provided by the caller to New().
 	cancel func()
+
+	// shimAddress is the location of the UDS used to communicate to containerd.
+	shimAddress string
 }
 
 func (s *service) newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) {
@@ -191,38 +205,58 @@ func (s *service) StartShim(ctx context.Context, id, containerdBinary, container
 	if err != nil {
 		return "", err
 	}
-	address, err := shim.SocketAddress(ctx, id)
+	address, err := shim.SocketAddress(ctx, containerdAddress, id)
 	if err != nil {
 		return "", err
 	}
 	socket, err := shim.NewSocket(address)
 	if err != nil {
-		return "", err
+		// The only time where this would happen is if there is a bug and the socket
+		// was not cleaned up in the cleanup method of the shim or we are using the
+		// grouping functionality where the new process should be run with the same
+		// shim as an existing container.
+		if !shim.SocketEaddrinuse(err) {
+			return "", fmt.Errorf("create new shim socket: %w", err)
+		}
+		if shim.CanConnect(address) {
+			if err := shim.WriteAddress(shimAddressPath, address); err != nil {
+				return "", fmt.Errorf("write existing socket for shim: %w", err)
+			}
+			return address, nil
+		}
+		if err := shim.RemoveSocket(address); err != nil {
+			return "", fmt.Errorf("remove pre-existing socket: %w", err)
+		}
+		if socket, err = shim.NewSocket(address); err != nil {
+			return "", fmt.Errorf("try create new shim socket 2x: %w", err)
+		}
 	}
-	defer socket.Close()
+	cu := cleanup.Make(func() {
+		socket.Close()
+		_ = shim.RemoveSocket(address)
+	})
+	defer cu.Clean()
+
 	f, err := socket.File()
 	if err != nil {
 		return "", err
 	}
-	defer f.Close()
 
 	cmd.ExtraFiles = append(cmd.ExtraFiles, f)
 
 	log.L.Debugf("Executing: %q %s", cmd.Path, cmd.Args)
 	if err := cmd.Start(); err != nil {
+		f.Close()
 		return "", err
 	}
-	cu := cleanup.Make(func() {
-		cmd.Process.Kill()
-	})
-	defer cu.Clean()
+	cu.Add(func() { cmd.Process.Kill() })
 
 	// make sure to wait after start
 	go cmd.Wait()
 	if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil {
 		return "", err
 	}
-	if err := shim.WriteAddress("address", address); err != nil {
+	if err := shim.WriteAddress(shimAddressPath, address); err != nil {
 		return "", err
 	}
 	if err := shim.SetScore(cmd.Process.Pid); err != nil {
@@ -675,8 +709,11 @@ func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*task
 func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) {
 	log.L.Debugf("Shutdown, id: %s", r.ID)
 	s.cancel()
+	if s.shimAddress != "" {
+		_ = shim.RemoveSocket(s.shimAddress)
+	}
 	os.Exit(0)
-	return empty, nil
+	panic("Should not get here")
 }
 
 func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
@@ -843,9 +880,7 @@ func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, er
 
 func (s *service) forward(ctx context.Context, publisher shim.Publisher) {
 	for e := range s.events {
-		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
 		err := publisher.Publish(ctx, getTopic(e), e)
-		cancel()
 		if err != nil {
 			// Should not happen.
 			panic(fmt.Errorf("post event: %w", err))
diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD
index ae0fe1522..48bcdd62b 100644
--- a/pkg/sleep/BUILD
+++ b/pkg/sleep/BUILD
@@ -5,10 +5,6 @@ package(licenses = ["notice"])
 go_library(
     name = "sleep",
     srcs = [
-        "commit_amd64.s",
-        "commit_arm64.s",
-        "commit_asm.go",
-        "commit_noasm.go",
         "sleep_unsafe.go",
     ],
     visibility = ["//:sandbox"],
diff --git a/pkg/sleep/commit_amd64.s b/pkg/sleep/commit_amd64.s
deleted file mode 100644
index bc4ac2c3c..000000000
--- a/pkg/sleep/commit_amd64.s
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "textflag.h"
-
-#define preparingG 1
-
-// See commit_noasm.go for a description of commitSleep.
-//
-// func commitSleep(g uintptr, waitingG *uintptr) bool
-TEXT ·commitSleep(SB),NOSPLIT,$0-24
-	MOVQ waitingG+8(FP), CX
-	MOVQ g+0(FP), DX
-
-	// Store the G in waitingG if it's still preparingG. If it's anything
-	// else it means a waker has aborted the sleep.
-	MOVQ $preparingG, AX
-	LOCK
-	CMPXCHGQ DX, 0(CX)
-
-	SETEQ AX
-	MOVB AX, ret+16(FP)
-
-	RET
diff --git a/pkg/sleep/commit_arm64.s b/pkg/sleep/commit_arm64.s
deleted file mode 100644
index d0ef15b20..000000000
--- a/pkg/sleep/commit_arm64.s
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "textflag.h"
-
-#define preparingG 1
-
-// See commit_noasm.go for a description of commitSleep.
-//
-// func commitSleep(g uintptr, waitingG *uintptr) bool
-TEXT ·commitSleep(SB),NOSPLIT,$0-24
-	MOVD waitingG+8(FP), R0
-	MOVD $preparingG, R1
-	MOVD G+0(FP), R2
-
-	// Store the G in waitingG if it's still preparingG. If it's anything
-	// else it means a waker has aborted the sleep.
-again:
-        LDAXR   (R0), R3
-        CMP     R1, R3
-        BNE     ok
-        STLXR   R2, (R0), R3
-        CBNZ    R3, again
-ok:
-        CSET    EQ, R0
-        MOVB    R0, ret+16(FP)
-        RET
diff --git a/pkg/sleep/commit_asm.go b/pkg/sleep/commit_asm.go
deleted file mode 100644
index 75728a97d..000000000
--- a/pkg/sleep/commit_asm.go
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64 arm64
-
-package sleep
-
-// See commit_noasm.go for a description of commitSleep.
-func commitSleep(g uintptr, waitingG *uintptr) bool
diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go
deleted file mode 100644
index f59061f37..000000000
--- a/pkg/sleep/commit_noasm.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build !race
-// +build !amd64,!arm64
-
-package sleep
-
-import "sync/atomic"
-
-// commitSleep signals to wakers that the given g is now sleeping. Wakers can
-// then fetch it and wake it.
-//
-// The commit may fail if wakers have been asserted after our last check, in
-// which case they will have set s.waitingG to zero.
-//
-// It is written in assembly because it is called from g0, so it doesn't have
-// a race context.
-func commitSleep(g uintptr, waitingG *uintptr) bool {
-	// Try to store the G so that wakers know who to wake.
-	return atomic.CompareAndSwapUintptr(waitingG, preparingG, g)
-}
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index 19bce2afb..c44206b1e 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -12,11 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build go1.11
-// +build !go1.17
-
-// Check go:linkname function signatures when updating Go version.
-
 // Package sleep allows goroutines to efficiently sleep on multiple sources of
 // notifications (wakers). It offers O(1) complexity, which is different from
 // multi-channel selects which have O(n) complexity (where n is the number of
@@ -91,12 +86,6 @@ var (
 	assertedSleeper Sleeper
 )
 
-//go:linkname gopark runtime.gopark
-func gopark(unlockf func(uintptr, *uintptr) bool, wg *uintptr, reason uint8, traceEv byte, traceskip int)
-
-//go:linkname goready runtime.goready
-func goready(g uintptr, traceskip int)
-
 // Sleeper allows a goroutine to sleep and receive wake up notifications from
 // Wakers in an efficient way.
 //
@@ -189,7 +178,7 @@ func (s *Sleeper) nextWaker(block bool) *Waker {
 			// See:runtime2.go in the go runtime package for
 			// the values to pass as the waitReason here.
 			const waitReasonSelect = 9
-			gopark(commitSleep, &s.waitingG, waitReasonSelect, traceEvGoBlockSelect, 0)
+			sync.Gopark(commitSleep, unsafe.Pointer(&s.waitingG), sync.WaitReasonSelect, sync.TraceEvGoBlockSelect, 0)
 		}
 
 		// Pull the shared list out and reverse it in the local
@@ -212,6 +201,18 @@ func (s *Sleeper) nextWaker(block bool) *Waker {
 	return w
 }
 
+// commitSleep signals to wakers that the given g is now sleeping. Wakers can
+// then fetch it and wake it.
+//
+// The commit may fail if wakers have been asserted after our last check, in
+// which case they will have set s.waitingG to zero.
+//
+//go:norace
+//go:nosplit
+func commitSleep(g uintptr, waitingG unsafe.Pointer) bool {
+	return sync.RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(waitingG), preparingG, g)
+}
+
 // Fetch fetches the next wake-up notification. If a notification is immediately
 // available, it is returned right away. Otherwise, the behavior depends on the
 // value of 'block': if true, the current goroutine blocks until a notification
@@ -311,7 +312,7 @@ func (s *Sleeper) enqueueAssertedWaker(w *Waker) {
 	case 0, preparingG:
 	default:
 		// We managed to get a G. Wake it up.
-		goready(g, 0)
+		sync.Goready(g, 0)
 	}
 }
 
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 68535c3b1..be5bc99fc 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -31,18 +31,26 @@ go_library(
     name = "sync",
     srcs = [
         "aliases.go",
-        "memmove_unsafe.go",
+        "checklocks_off_unsafe.go",
+        "checklocks_on_unsafe.go",
+        "goyield_go113_unsafe.go",
+        "goyield_unsafe.go",
         "mutex_unsafe.go",
         "nocopy.go",
         "norace_unsafe.go",
+        "race_amd64.s",
+        "race_arm64.s",
         "race_unsafe.go",
+        "runtime_unsafe.go",
         "rwmutex_unsafe.go",
         "seqcount.go",
-        "spin_unsafe.go",
         "sync.go",
     ],
     marshal = False,
     stateify = False,
+    deps = [
+        "//pkg/goid",
+    ],
 )
 
 go_test(
diff --git a/pkg/sync/checklocks_off_unsafe.go b/pkg/sync/checklocks_off_unsafe.go
new file mode 100644
index 000000000..62c81b149
--- /dev/null
+++ b/pkg/sync/checklocks_off_unsafe.go
@@ -0,0 +1,18 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !checklocks
+
+package sync
+
+import (
+	"unsafe"
+)
+
+func noteLock(l unsafe.Pointer) {
+}
+
+func noteUnlock(l unsafe.Pointer) {
+}
diff --git a/pkg/sync/checklocks_on_unsafe.go b/pkg/sync/checklocks_on_unsafe.go
new file mode 100644
index 000000000..24f933ed1
--- /dev/null
+++ b/pkg/sync/checklocks_on_unsafe.go
@@ -0,0 +1,108 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build checklocks
+
+package sync
+
+import (
+	"fmt"
+	"strings"
+	"sync"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/goid"
+)
+
+// gLocks contains metadata about the locks held by a goroutine.
+type gLocks struct {
+	locksHeld []unsafe.Pointer
+}
+
+// map[goid int]*gLocks
+//
+// Each key may only be written by the G with the goid it refers to.
+//
+// Note that entries are not evicted when a G exit, causing unbounded growth
+// with new G creation / destruction. If this proves problematic, entries could
+// be evicted when no locks are held at the expense of more allocations when
+// taking top-level locks.
+var locksHeld sync.Map
+
+func getGLocks() *gLocks {
+	id := goid.Get()
+
+	var locks *gLocks
+	if l, ok := locksHeld.Load(id); ok {
+		locks = l.(*gLocks)
+	} else {
+		locks = &gLocks{
+			// Initialize space for a few locks.
+			locksHeld: make([]unsafe.Pointer, 0, 8),
+		}
+		locksHeld.Store(id, locks)
+	}
+
+	return locks
+}
+
+func noteLock(l unsafe.Pointer) {
+	locks := getGLocks()
+
+	for _, lock := range locks.locksHeld {
+		if lock == l {
+			panic(fmt.Sprintf("Deadlock on goroutine %d! Double lock of %p: %+v", goid.Get(), l, locks))
+		}
+	}
+
+	// Commit only after checking for panic conditions so that this lock
+	// isn't on the list if the above panic is recovered.
+	locks.locksHeld = append(locks.locksHeld, l)
+}
+
+func noteUnlock(l unsafe.Pointer) {
+	locks := getGLocks()
+
+	if len(locks.locksHeld) == 0 {
+		panic(fmt.Sprintf("Unlock of %p on goroutine %d without any locks held! All locks:\n%s", l, goid.Get(), dumpLocks()))
+	}
+
+	// Search backwards since callers are most likely to unlock in LIFO order.
+	length := len(locks.locksHeld)
+	for i := length - 1; i >= 0; i-- {
+		if l == locks.locksHeld[i] {
+			copy(locks.locksHeld[i:length-1], locks.locksHeld[i+1:length])
+			// Clear last entry to ensure addr can be GC'd.
+			locks.locksHeld[length-1] = nil
+			locks.locksHeld = locks.locksHeld[:length-1]
+			return
+		}
+	}
+
+	panic(fmt.Sprintf("Unlock of %p on goroutine %d without matching lock! All locks:\n%s", l, goid.Get(), dumpLocks()))
+}
+
+func dumpLocks() string {
+	var s strings.Builder
+	locksHeld.Range(func(key, value interface{}) bool {
+		goid := key.(int64)
+		locks := value.(*gLocks)
+
+		// N.B. accessing gLocks of another G is fundamentally racy.
+
+		fmt.Fprintf(&s, "goroutine %d:\n", goid)
+		if len(locks.locksHeld) == 0 {
+			fmt.Fprintf(&s, "\t<none>\n")
+		}
+		for _, lock := range locks.locksHeld {
+			fmt.Fprintf(&s, "\t%p\n", lock)
+		}
+		fmt.Fprintf(&s, "\n")
+
+		return true
+	})
+
+	return s.String()
+}
diff --git a/pkg/sync/goyield_go113_unsafe.go b/pkg/sync/goyield_go113_unsafe.go
new file mode 100644
index 000000000..8aee0d455
--- /dev/null
+++ b/pkg/sync/goyield_go113_unsafe.go
@@ -0,0 +1,18 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.14
+
+package sync
+
+import (
+	"runtime"
+)
+
+func goyield() {
+	// goyield is not available until Go 1.14.
+	runtime.Gosched()
+}
diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/goyield_unsafe.go
index cafb2d065..672ee274d 100644
--- a/pkg/sync/spin_unsafe.go
+++ b/pkg/sync/goyield_unsafe.go
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build go1.13
+// +build go1.14
 // +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
@@ -14,11 +14,5 @@ import (
 	_ "unsafe" // for go:linkname
 )
 
-//go:linkname canSpin sync.runtime_canSpin
-func canSpin(i int) bool
-
-//go:linkname doSpin sync.runtime_doSpin
-func doSpin()
-
 //go:linkname goyield runtime.goyield
 func goyield()
diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
deleted file mode 100644
index f5e630009..000000000
--- a/pkg/sync/memmove_unsafe.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.17
-
-// Check go:linkname function signatures when updating Go version.
-
-package sync
-
-import (
-	"unsafe"
-)
-
-//go:linkname memmove runtime.memmove
-//go:noescape
-func memmove(to, from unsafe.Pointer, n uintptr)
-
-// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't
-// define it because go_generics can't update the go:linkname annotation.
-// Furthermore, go:linkname silently doesn't work if the local name is exported
-// (this is of course undocumented), which is why this indirection is
-// necessary.
-func Memmove(to, from unsafe.Pointer, n uintptr) {
-	memmove(to, from, n)
-}
diff --git a/pkg/sync/mutex_test.go b/pkg/sync/mutex_test.go
index 0838248b4..4fb51a8ab 100644
--- a/pkg/sync/mutex_test.go
+++ b/pkg/sync/mutex_test.go
@@ -32,11 +32,11 @@ func TestStructSize(t *testing.T) {
 func TestFieldValues(t *testing.T) {
 	var m Mutex
 	m.Lock()
-	if got := *m.state(); got != mutexLocked {
+	if got := *m.m.state(); got != mutexLocked {
 		t.Errorf("got locked sync.Mutex.state = %d, want = %d", got, mutexLocked)
 	}
 	m.Unlock()
-	if got := *m.state(); got != mutexUnlocked {
+	if got := *m.m.state(); got != mutexUnlocked {
 		t.Errorf("got unlocked sync.Mutex.state = %d, want = %d", got, mutexUnlocked)
 	}
 }
diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go
index f4c2e9642..21084b857 100644
--- a/pkg/sync/mutex_unsafe.go
+++ b/pkg/sync/mutex_unsafe.go
@@ -17,8 +17,9 @@ import (
 	"unsafe"
 )
 
-// Mutex is a try lock.
-type Mutex struct {
+// CrossGoroutineMutex is equivalent to Mutex, but it need not be unlocked by a
+// the same goroutine that locked the mutex.
+type CrossGoroutineMutex struct {
 	sync.Mutex
 }
 
@@ -27,7 +28,7 @@ type syncMutex struct {
 	sema  uint32
 }
 
-func (m *Mutex) state() *int32 {
+func (m *CrossGoroutineMutex) state() *int32 {
 	return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state
 }
 
@@ -36,9 +37,9 @@ const (
 	mutexLocked   = 1
 )
 
-// TryLock tries to aquire the mutex. It returns true if it succeeds and false
+// TryLock tries to acquire the mutex. It returns true if it succeeds and false
 // otherwise. TryLock does not block.
-func (m *Mutex) TryLock() bool {
+func (m *CrossGoroutineMutex) TryLock() bool {
 	if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) {
 		if RaceEnabled {
 			RaceAcquire(unsafe.Pointer(&m.Mutex))
@@ -47,3 +48,43 @@ func (m *Mutex) TryLock() bool {
 	}
 	return false
 }
+
+// Mutex is a mutual exclusion lock. The zero value for a Mutex is an unlocked
+// mutex.
+//
+// A Mutex must not be copied after first use.
+//
+// A Mutex must be unlocked by the same goroutine that locked it. This
+// invariant is enforced with the 'checklocks' build tag.
+type Mutex struct {
+	m CrossGoroutineMutex
+}
+
+// Lock locks m. If the lock is already in use, the calling goroutine blocks
+// until the mutex is available.
+func (m *Mutex) Lock() {
+	noteLock(unsafe.Pointer(m))
+	m.m.Lock()
+}
+
+// Unlock unlocks m.
+//
+// Preconditions:
+// * m is locked.
+// * m was locked by this goroutine.
+func (m *Mutex) Unlock() {
+	noteUnlock(unsafe.Pointer(m))
+	m.m.Unlock()
+}
+
+// TryLock tries to acquire the mutex. It returns true if it succeeds and false
+// otherwise. TryLock does not block.
+func (m *Mutex) TryLock() bool {
+	// Note lock first to enforce proper locking even if unsuccessful.
+	noteLock(unsafe.Pointer(m))
+	locked := m.m.TryLock()
+	if !locked {
+		noteUnlock(unsafe.Pointer(m))
+	}
+	return locked
+}
diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go
index 006055dd6..70b5f3a5e 100644
--- a/pkg/sync/norace_unsafe.go
+++ b/pkg/sync/norace_unsafe.go
@@ -8,6 +8,7 @@
 package sync
 
 import (
+	"sync/atomic"
 	"unsafe"
 )
 
@@ -33,3 +34,13 @@ func RaceRelease(addr unsafe.Pointer) {
 // RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
 func RaceReleaseMerge(addr unsafe.Pointer) {
 }
+
+// RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to
+// sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector.
+// This is necessary when implementing gopark callbacks, since no race context
+// is available during their execution.
+func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool {
+	// Use atomic.CompareAndSwapUintptr outside of race builds for
+	// inlinability.
+	return atomic.CompareAndSwapUintptr(ptr, old, new)
+}
diff --git a/pkg/syncevent/waiter_amd64.s b/pkg/sync/race_amd64.s
index 5e216b045..57bc0ec79 100644
--- a/pkg/syncevent/waiter_amd64.s
+++ b/pkg/sync/race_amd64.s
@@ -12,21 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build race
+// +build amd64
+
 #include "textflag.h"
 
-// See waiter_noasm_unsafe.go for a description of waiterUnlock.
-//
-// func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool
-TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+// func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool
+TEXT ·RaceUncheckedAtomicCompareAndSwapUintptr(SB),NOSPLIT,$0-25
 	MOVQ ptr+0(FP), DI
-	MOVQ wg+8(FP), SI
+	MOVQ old+8(FP), AX
+	MOVQ new+16(FP), SI
 
-	MOVQ $·preparingG(SB), AX
 	LOCK
-	CMPXCHGQ DI, 0(SI)
+	CMPXCHGQ SI, 0(DI)
 
 	SETEQ AX
-	MOVB AX, ret+16(FP)
+	MOVB AX, ret+24(FP)
 
 	RET
 
diff --git a/pkg/syncevent/waiter_arm64.s b/pkg/sync/race_arm64.s
index f4c06f194..88f091fda 100644
--- a/pkg/syncevent/waiter_arm64.s
+++ b/pkg/sync/race_arm64.s
@@ -12,15 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build race
+// +build arm64
+
 #include "textflag.h"
 
-// See waiter_noasm_unsafe.go for a description of waiterUnlock.
-//
-// func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool
-TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
-	MOVD wg+8(FP), R0
-	MOVD $·preparingG(SB), R1
-	MOVD ptr+0(FP), R2
+// func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool
+TEXT ·RaceUncheckedAtomicCompareAndSwapUintptr(SB),NOSPLIT,$0-25
+	MOVD ptr+0(FP), R0
+	MOVD old+8(FP), R1
+	MOVD new+16(FP), R1
 again:
 	LDAXR (R0), R3
 	CMP R1, R3
@@ -29,6 +30,6 @@ again:
 	CBNZ R3, again
 ok:
 	CSET EQ, R0
-	MOVB R0, ret+16(FP)
+	MOVB R0, ret+24(FP)
 	RET
 
diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go
index 31d8fa9a6..59985c270 100644
--- a/pkg/sync/race_unsafe.go
+++ b/pkg/sync/race_unsafe.go
@@ -39,3 +39,9 @@ func RaceRelease(addr unsafe.Pointer) {
 func RaceReleaseMerge(addr unsafe.Pointer) {
 	runtime.RaceReleaseMerge(addr)
 }
+
+// RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to
+// sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector.
+// This is necessary when implementing gopark callbacks, since no race context
+// is available during their execution.
+func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool
diff --git a/pkg/sync/runtime_unsafe.go b/pkg/sync/runtime_unsafe.go
new file mode 100644
index 000000000..7ad6a4434
--- /dev/null
+++ b/pkg/sync/runtime_unsafe.go
@@ -0,0 +1,76 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.17
+
+// Check function signatures and constants when updating Go version.
+
+package sync
+
+import (
+	"unsafe"
+)
+
+// Note that go:linkname silently doesn't work if the local name is exported,
+// necessitating an indirection for exported functions.
+
+// Memmove is runtime.memmove, exported for SeqAtomicLoad/SeqAtomicTryLoad<T>.
+//
+//go:nosplit
+func Memmove(to, from unsafe.Pointer, n uintptr) {
+	memmove(to, from, n)
+}
+
+//go:linkname memmove runtime.memmove
+//go:noescape
+func memmove(to, from unsafe.Pointer, n uintptr)
+
+// Gopark is runtime.gopark. Gopark calls unlockf(pointer to runtime.g, lock);
+// if unlockf returns true, Gopark blocks until Goready(pointer to runtime.g)
+// is called. unlockf and its callees must be nosplit and norace, since stack
+// splitting and race context are not available where it is called.
+//
+//go:nosplit
+func Gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int) {
+	gopark(unlockf, lock, reason, traceEv, traceskip)
+}
+
+//go:linkname gopark runtime.gopark
+func gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int)
+
+// Goready is runtime.goready.
+//
+//go:nosplit
+func Goready(gp uintptr, traceskip int) {
+	goready(gp, traceskip)
+}
+
+//go:linkname goready runtime.goready
+func goready(gp uintptr, traceskip int)
+
+// Values for the reason argument to gopark, from Go's src/runtime/runtime2.go.
+const (
+	WaitReasonSelect uint8 = 9
+)
+
+// Values for the traceEv argument to gopark, from Go's src/runtime/trace.go.
+const (
+	TraceEvGoBlockSelect byte = 24
+)
+
+// These functions are only used within the sync package.
+
+//go:linkname semacquire sync.runtime_Semacquire
+func semacquire(s *uint32)
+
+//go:linkname semrelease sync.runtime_Semrelease
+func semrelease(s *uint32, handoff bool, skipframes int)
+
+//go:linkname canSpin sync.runtime_canSpin
+func canSpin(i int) bool
+
+//go:linkname doSpin sync.runtime_doSpin
+func doSpin()
diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go
index b3b4dee78..4cf3fcd6e 100644
--- a/pkg/sync/rwmutex_unsafe.go
+++ b/pkg/sync/rwmutex_unsafe.go
@@ -3,11 +3,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build go1.13
-// +build !go1.17
-
-// Check go:linkname function signatures when updating Go version.
-
 // This is mostly copied from the standard library's sync/rwmutex.go.
 //
 // Happens-before relationships indicated to the race detector:
@@ -23,16 +18,15 @@ import (
 	"unsafe"
 )
 
-//go:linkname runtimeSemacquire sync.runtime_Semacquire
-func runtimeSemacquire(s *uint32)
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
-
-// RWMutex is identical to sync.RWMutex, but adds the DowngradeLock,
-// TryLock and TryRLock methods.
-type RWMutex struct {
-	w           Mutex  // held if there are pending writers
+// CrossGoroutineRWMutex is equivalent to RWMutex, but it need not be unlocked
+// by a the same goroutine that locked the mutex.
+type CrossGoroutineRWMutex struct {
+	// w is held if there are pending writers
+	//
+	// We use CrossGoroutineMutex rather than Mutex because the lock
+	// annotation instrumentation in Mutex will trigger false positives in
+	// the race detector when called inside of RaceDisable.
+	w           CrossGoroutineMutex
 	writerSem   uint32 // semaphore for writers to wait for completing readers
 	readerSem   uint32 // semaphore for readers to wait for completing writers
 	readerCount int32  // number of pending readers
@@ -43,7 +37,7 @@ const rwmutexMaxReaders = 1 << 30
 
 // TryRLock locks rw for reading. It returns true if it succeeds and false
 // otherwise. It does not block.
-func (rw *RWMutex) TryRLock() bool {
+func (rw *CrossGoroutineRWMutex) TryRLock() bool {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -67,13 +61,17 @@ func (rw *RWMutex) TryRLock() bool {
 }
 
 // RLock locks rw for reading.
-func (rw *RWMutex) RLock() {
+//
+// It should not be used for recursive read locking; a blocked Lock call
+// excludes new readers from acquiring the lock. See the documentation on the
+// RWMutex type.
+func (rw *CrossGoroutineRWMutex) RLock() {
 	if RaceEnabled {
 		RaceDisable()
 	}
 	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
 		// A writer is pending, wait for it.
-		runtimeSemacquire(&rw.readerSem)
+		semacquire(&rw.readerSem)
 	}
 	if RaceEnabled {
 		RaceEnable()
@@ -82,7 +80,10 @@ func (rw *RWMutex) RLock() {
 }
 
 // RUnlock undoes a single RLock call.
-func (rw *RWMutex) RUnlock() {
+//
+// Preconditions:
+// * rw is locked for reading.
+func (rw *CrossGoroutineRWMutex) RUnlock() {
 	if RaceEnabled {
 		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
 		RaceDisable()
@@ -94,7 +95,7 @@ func (rw *RWMutex) RUnlock() {
 		// A writer is pending.
 		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
 			// The last reader unblocks the writer.
-			runtimeSemrelease(&rw.writerSem, false, 0)
+			semrelease(&rw.writerSem, false, 0)
 		}
 	}
 	if RaceEnabled {
@@ -104,7 +105,7 @@ func (rw *RWMutex) RUnlock() {
 
 // TryLock locks rw for writing. It returns true if it succeeds and false
 // otherwise. It does not block.
-func (rw *RWMutex) TryLock() bool {
+func (rw *CrossGoroutineRWMutex) TryLock() bool {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -130,8 +131,9 @@ func (rw *RWMutex) TryLock() bool {
 	return true
 }
 
-// Lock locks rw for writing.
-func (rw *RWMutex) Lock() {
+// Lock locks rw for writing. If the lock is already locked for reading or
+// writing, Lock blocks until the lock is available.
+func (rw *CrossGoroutineRWMutex) Lock() {
 	if RaceEnabled {
 		RaceDisable()
 	}
@@ -141,7 +143,7 @@ func (rw *RWMutex) Lock() {
 	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
 	// Wait for active readers.
 	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
-		runtimeSemacquire(&rw.writerSem)
+		semacquire(&rw.writerSem)
 	}
 	if RaceEnabled {
 		RaceEnable()
@@ -150,7 +152,10 @@ func (rw *RWMutex) Lock() {
 }
 
 // Unlock unlocks rw for writing.
-func (rw *RWMutex) Unlock() {
+//
+// Preconditions:
+// * rw is locked for writing.
+func (rw *CrossGoroutineRWMutex) Unlock() {
 	if RaceEnabled {
 		RaceRelease(unsafe.Pointer(&rw.writerSem))
 		RaceRelease(unsafe.Pointer(&rw.readerSem))
@@ -163,7 +168,7 @@ func (rw *RWMutex) Unlock() {
 	}
 	// Unblock blocked readers, if any.
 	for i := 0; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
+		semrelease(&rw.readerSem, false, 0)
 	}
 	// Allow other writers to proceed.
 	rw.w.Unlock()
@@ -173,7 +178,10 @@ func (rw *RWMutex) Unlock() {
 }
 
 // DowngradeLock atomically unlocks rw for writing and locks it for reading.
-func (rw *RWMutex) DowngradeLock() {
+//
+// Preconditions:
+// * rw is locked for writing.
+func (rw *CrossGoroutineRWMutex) DowngradeLock() {
 	if RaceEnabled {
 		RaceRelease(unsafe.Pointer(&rw.readerSem))
 		RaceDisable()
@@ -186,7 +194,7 @@ func (rw *RWMutex) DowngradeLock() {
 	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
 	// includes this goroutine.
 	for i := 1; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
+		semrelease(&rw.readerSem, false, 0)
 	}
 	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
 	// block on rw.writerSem since at least this reader exists, such that
@@ -196,3 +204,91 @@ func (rw *RWMutex) DowngradeLock() {
 		RaceEnable()
 	}
 }
+
+// A RWMutex is a reader/writer mutual exclusion lock. The lock can be held by
+// an arbitrary number of readers or a single writer. The zero value for a
+// RWMutex is an unlocked mutex.
+//
+// A RWMutex must not be copied after first use.
+//
+// If a goroutine holds a RWMutex for reading and another goroutine might call
+// Lock, no goroutine should expect to be able to acquire a read lock until the
+// initial read lock is released. In particular, this prohibits recursive read
+// locking. This is to ensure that the lock eventually becomes available; a
+// blocked Lock call excludes new readers from acquiring the lock.
+//
+// A Mutex must be unlocked by the same goroutine that locked it. This
+// invariant is enforced with the 'checklocks' build tag.
+type RWMutex struct {
+	m CrossGoroutineRWMutex
+}
+
+// TryRLock locks rw for reading. It returns true if it succeeds and false
+// otherwise. It does not block.
+func (rw *RWMutex) TryRLock() bool {
+	// Note lock first to enforce proper locking even if unsuccessful.
+	noteLock(unsafe.Pointer(rw))
+	locked := rw.m.TryRLock()
+	if !locked {
+		noteUnlock(unsafe.Pointer(rw))
+	}
+	return locked
+}
+
+// RLock locks rw for reading.
+//
+// It should not be used for recursive read locking; a blocked Lock call
+// excludes new readers from acquiring the lock. See the documentation on the
+// RWMutex type.
+func (rw *RWMutex) RLock() {
+	noteLock(unsafe.Pointer(rw))
+	rw.m.RLock()
+}
+
+// RUnlock undoes a single RLock call.
+//
+// Preconditions:
+// * rw is locked for reading.
+// * rw was locked by this goroutine.
+func (rw *RWMutex) RUnlock() {
+	rw.m.RUnlock()
+	noteUnlock(unsafe.Pointer(rw))
+}
+
+// TryLock locks rw for writing. It returns true if it succeeds and false
+// otherwise. It does not block.
+func (rw *RWMutex) TryLock() bool {
+	// Note lock first to enforce proper locking even if unsuccessful.
+	noteLock(unsafe.Pointer(rw))
+	locked := rw.m.TryLock()
+	if !locked {
+		noteUnlock(unsafe.Pointer(rw))
+	}
+	return locked
+}
+
+// Lock locks rw for writing. If the lock is already locked for reading or
+// writing, Lock blocks until the lock is available.
+func (rw *RWMutex) Lock() {
+	noteLock(unsafe.Pointer(rw))
+	rw.m.Lock()
+}
+
+// Unlock unlocks rw for writing.
+//
+// Preconditions:
+// * rw is locked for writing.
+// * rw was locked by this goroutine.
+func (rw *RWMutex) Unlock() {
+	rw.m.Unlock()
+	noteUnlock(unsafe.Pointer(rw))
+}
+
+// DowngradeLock atomically unlocks rw for writing and locks it for reading.
+//
+// Preconditions:
+// * rw is locked for writing.
+func (rw *RWMutex) DowngradeLock() {
+	// No note change for DowngradeLock.
+	rw.m.DowngradeLock()
+}
diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go
index 2184cb5ab..780f3b8f8 100644
--- a/pkg/sync/seqatomic_unsafe.go
+++ b/pkg/sync/seqatomic_unsafe.go
@@ -8,20 +8,12 @@
 package template
 
 import (
-	"fmt"
-	"reflect"
-	"strings"
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Value is a required type parameter.
-//
-// Value must not contain any pointers, including interface objects, function
-// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs
-// containing any of the above. An init() function will panic if this property
-// does not hold.
 type Value struct{}
 
 // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
@@ -55,12 +47,3 @@ func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value)
 	ok = seq.ReadOk(epoch)
 	return
 }
-
-func init() {
-	var val Value
-	typ := reflect.TypeOf(val)
-	name := typ.Name()
-	if ptrs := sync.PointersInType(typ, name); len(ptrs) != 0 {
-		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
-	}
-}
diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go
index 2c5d3df99..1f025f33c 100644
--- a/pkg/sync/seqcount.go
+++ b/pkg/sync/seqcount.go
@@ -6,8 +6,6 @@
 package sync
 
 import (
-	"fmt"
-	"reflect"
 	"sync/atomic"
 )
 
@@ -27,9 +25,6 @@ import (
 // - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
 // operations to be made atomic with reads of SeqCount-protected data.
 //
-// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
-// cannot include pointers.
-//
 // - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
 // data require instantiating function templates using go_generics (see
 // seqatomic.go).
@@ -128,32 +123,3 @@ func (s *SeqCount) EndWrite() {
 		panic("SeqCount.EndWrite outside writer critical section")
 	}
 }
-
-// PointersInType returns a list of pointers reachable from values named
-// valName of the given type.
-//
-// PointersInType is not exhaustive, but it is guaranteed that if typ contains
-// at least one pointer, then PointersInTypeOf returns a non-empty list.
-func PointersInType(typ reflect.Type, valName string) []string {
-	switch kind := typ.Kind(); kind {
-	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
-		return nil
-
-	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
-		return []string{valName}
-
-	case reflect.Array:
-		return PointersInType(typ.Elem(), valName+"[]")
-
-	case reflect.Struct:
-		var ptrs []string
-		for i, n := 0, typ.NumField(); i < n; i++ {
-			field := typ.Field(i)
-			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
-		}
-		return ptrs
-
-	default:
-		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)}
-	}
-}
diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go
index 6eb7b4b59..3f5592e3e 100644
--- a/pkg/sync/seqcount_test.go
+++ b/pkg/sync/seqcount_test.go
@@ -6,7 +6,6 @@
 package sync
 
 import (
-	"reflect"
 	"testing"
 	"time"
 )
@@ -99,55 +98,3 @@ func BenchmarkSeqCountReadUncontended(b *testing.B) {
 		}
 	})
 }
-
-func TestPointersInType(t *testing.T) {
-	for _, test := range []struct {
-		name string // used for both test and value name
-		val  interface{}
-		ptrs []string
-	}{
-		{
-			name: "EmptyStruct",
-			val:  struct{}{},
-		},
-		{
-			name: "Int",
-			val:  int(0),
-		},
-		{
-			name: "MixedStruct",
-			val: struct {
-				b             bool
-				I             int
-				ExportedPtr   *struct{}
-				unexportedPtr *struct{}
-				arr           [2]int
-				ptrArr        [2]*int
-				nestedStruct  struct {
-					nestedNonptr int
-					nestedPtr    *int
-				}
-				structArr [1]struct {
-					nonptr int
-					ptr    *int
-				}
-			}{},
-			ptrs: []string{
-				"MixedStruct.ExportedPtr",
-				"MixedStruct.unexportedPtr",
-				"MixedStruct.ptrArr[]",
-				"MixedStruct.nestedStruct.nestedPtr",
-				"MixedStruct.structArr[].ptr",
-			},
-		},
-	} {
-		t.Run(test.name, func(t *testing.T) {
-			typ := reflect.TypeOf(test.val)
-			ptrs := PointersInType(typ, test.name)
-			t.Logf("Found pointers: %v", ptrs)
-			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
-				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
-			}
-		})
-	}
-}
diff --git a/pkg/syncevent/BUILD b/pkg/syncevent/BUILD
index 0500a22cf..42c553308 100644
--- a/pkg/syncevent/BUILD
+++ b/pkg/syncevent/BUILD
@@ -9,10 +9,6 @@ go_library(
         "receiver.go",
         "source.go",
         "syncevent.go",
-        "waiter_amd64.s",
-        "waiter_arm64.s",
-        "waiter_asm_unsafe.go",
-        "waiter_noasm_unsafe.go",
         "waiter_unsafe.go",
     ],
     visibility = ["//:sandbox"],
diff --git a/pkg/syncevent/waiter_noasm_unsafe.go b/pkg/syncevent/waiter_noasm_unsafe.go
deleted file mode 100644
index 0f74a689c..000000000
--- a/pkg/syncevent/waiter_noasm_unsafe.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// waiterUnlock is called from g0, so when the race detector is enabled,
-// waiterUnlock must be implemented in assembly since no race context is
-// available.
-//
-// +build !race
-// +build !amd64,!arm64
-
-package syncevent
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-// waiterUnlock is the "unlock function" passed to runtime.gopark by
-// Waiter.Wait*. wg is &Waiter.g, and g is a pointer to the calling runtime.g.
-// waiterUnlock returns true if Waiter.Wait should sleep and false if sleeping
-// should be aborted.
-//
-//go:nosplit
-func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool {
-	// The only way this CAS can fail is if a call to Waiter.NotifyPending()
-	// has replaced *wg with nil, in which case we should not sleep.
-	return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), ptr)
-}
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
index 518f18479..b6ed2852d 100644
--- a/pkg/syncevent/waiter_unsafe.go
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -12,11 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build go1.11
-// +build !go1.17
-
-// Check go:linkname function signatures when updating Go version.
-
 package syncevent
 
 import (
@@ -26,17 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
-//go:linkname gopark runtime.gopark
-func gopark(unlockf func(unsafe.Pointer, *unsafe.Pointer) bool, wg *unsafe.Pointer, reason uint8, traceEv byte, traceskip int)
-
-//go:linkname goready runtime.goready
-func goready(g unsafe.Pointer, traceskip int)
-
-const (
-	waitReasonSelect     = 9  // Go: src/runtime/runtime2.go
-	traceEvGoBlockSelect = 24 // Go: src/runtime/trace.go
-)
-
 // Waiter allows a goroutine to block on pending events received by a Receiver.
 //
 // Waiter.Init() must be called before first use.
@@ -45,20 +29,19 @@ type Waiter struct {
 
 	// g is one of:
 	//
-	// - nil: No goroutine is blocking in Wait.
+	// - 0: No goroutine is blocking in Wait.
 	//
-	// - &preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet
+	// - preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet
 	// completed waiterUnlock(). Thus the wait can only be interrupted by
-	// replacing the value of g with nil (the G may not be in state Gwaiting
-	// yet, so we can't call goready.)
+	// replacing the value of g with 0 (the G may not be in state Gwaiting yet,
+	// so we can't call goready.)
 	//
 	// - Otherwise: g is a pointer to the runtime.g in state Gwaiting for the
 	// goroutine blocked in Wait, which can only be woken by calling goready.
-	g unsafe.Pointer `state:"zerovalue"`
+	g uintptr `state:"zerovalue"`
 }
 
-// Sentinel object for Waiter.g.
-var preparingG struct{}
+const preparingG = 1
 
 // Init must be called before first use of w.
 func (w *Waiter) Init() {
@@ -99,21 +82,29 @@ func (w *Waiter) WaitFor(es Set) Set {
 		}
 
 		// Indicate that we're preparing to go to sleep.
-		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+		atomic.StoreUintptr(&w.g, preparingG)
 
 		// If an event is pending, abort the sleep.
 		if p := w.r.Pending(); p&es != NoEvents {
-			atomic.StorePointer(&w.g, nil)
+			atomic.StoreUintptr(&w.g, 0)
 			return p
 		}
 
 		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
-		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// called or has not reached atomic.SwapUintptr()), go to sleep until
 		// w.NotifyPending() => goready().
-		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+		sync.Gopark(waiterCommit, unsafe.Pointer(&w.g), sync.WaitReasonSelect, sync.TraceEvGoBlockSelect, 0)
 	}
 }
 
+//go:norace
+//go:nosplit
+func waiterCommit(g uintptr, wg unsafe.Pointer) bool {
+	// The only way this CAS can fail is if a call to Waiter.NotifyPending()
+	// has replaced *wg with nil, in which case we should not sleep.
+	return sync.RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(wg), preparingG, g)
+}
+
 // Ack marks the given events as not pending.
 func (w *Waiter) Ack(es Set) {
 	w.r.Ack(es)
@@ -135,20 +126,20 @@ func (w *Waiter) WaitAndAckAll() Set {
 
 	for {
 		// Indicate that we're preparing to go to sleep.
-		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+		atomic.StoreUintptr(&w.g, preparingG)
 
 		// If an event is pending, abort the sleep.
 		if w.r.Pending() != NoEvents {
 			if p := w.r.PendingAndAckAll(); p != NoEvents {
-				atomic.StorePointer(&w.g, nil)
+				atomic.StoreUintptr(&w.g, 0)
 				return p
 			}
 		}
 
 		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
-		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// called or has not reached atomic.SwapUintptr()), go to sleep until
 		// w.NotifyPending() => goready().
-		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+		sync.Gopark(waiterCommit, unsafe.Pointer(&w.g), sync.WaitReasonSelect, sync.TraceEvGoBlockSelect, 0)
 
 		// Check for pending events. We call PendingAndAckAll() directly now since
 		// we only expect to be woken after events become pending.
@@ -171,14 +162,14 @@ func (w *Waiter) NotifyPending() {
 	// goroutine. NotifyPending is called after w.r.Pending() is updated, so
 	// concurrent and future calls to w.Wait() will observe pending events and
 	// abort sleeping.
-	if atomic.LoadPointer(&w.g) == nil {
+	if atomic.LoadUintptr(&w.g) == 0 {
 		return
 	}
 	// Wake a sleeping G, or prevent a G that is preparing to sleep from doing
 	// so. Swap is needed here to ensure that only one call to NotifyPending
 	// calls goready.
-	if g := atomic.SwapPointer(&w.g, nil); g != nil && g != (unsafe.Pointer)(&preparingG) {
-		goready(g, 0)
+	if g := atomic.SwapUintptr(&w.g, 0); g > preparingG {
+		sync.Goready(g, 0)
 	}
 }
 
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 81f762e10..d3ae56ac6 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -20,6 +20,7 @@ import (
 	"encoding/binary"
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -216,6 +217,42 @@ func IPv4Options(want header.IPv4Options) NetworkChecker {
 	}
 }
 
+// IPv4RouterAlert returns a checker that checks that the RouterAlert option is
+// set in an IPv4 packet.
+func IPv4RouterAlert() NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+		ip, ok := h[0].(header.IPv4)
+		if !ok {
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4", h[0])
+		}
+		iterator := ip.Options().MakeIterator()
+		for {
+			opt, done, err := iterator.Next()
+			if err != nil {
+				t.Fatalf("error acquiring next IPv4 option %s", err)
+			}
+			if done {
+				break
+			}
+			if opt.Type() != header.IPv4OptionRouterAlertType {
+				continue
+			}
+			want := [header.IPv4OptionRouterAlertLength]byte{
+				byte(header.IPv4OptionRouterAlertType),
+				header.IPv4OptionRouterAlertLength,
+				header.IPv4OptionRouterAlertValue,
+				header.IPv4OptionRouterAlertValue,
+			}
+			if diff := cmp.Diff(want[:], opt.Contents()); diff != "" {
+				t.Errorf("router alert option mismatch (-want +got):\n%s", diff)
+			}
+			return
+		}
+		t.Errorf("failed to find router alert option in %v", ip.Options())
+	}
+}
+
 // FragmentOffset creates a checker that checks the FragmentOffset field.
 func FragmentOffset(offset uint16) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
@@ -1012,6 +1049,74 @@ func ICMPv6Payload(want []byte) TransportChecker {
 	}
 }
 
+// MLD creates a checker that checks that the packet contains a valid MLD
+// message for type of mldType, with potentially additional checks specified by
+// checkers.
+//
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// MLD message as far as the size of the message (minSize) is concerned. The
+// values within the message are up to checkers to validate.
+func MLD(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// Check normal ICMPv6 first.
+		ICMPv6(
+			ICMPv6Type(msgType),
+			ICMPv6Code(0))(t, h)
+
+		last := h[len(h)-1]
+
+		icmp := header.ICMPv6(last.Payload())
+		if got := len(icmp.MessageBody()); got < minSize {
+			t.Fatalf("ICMPv6 MLD (type = %d) payload size of %d is less than the minimum size of %d", msgType, got, minSize)
+		}
+
+		for _, f := range checkers {
+			f(t, icmp)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// MLDMaxRespDelay creates a checker that checks the Maximum Response Delay
+// field of a MLD message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid MLD message as far as the size is concerned.
+func MLDMaxRespDelay(want time.Duration) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		ns := header.MLD(icmp.MessageBody())
+
+		if got := ns.MaximumResponseDelay(); got != want {
+			t.Errorf("got %T.MaximumResponseDelay() = %s, want = %s", ns, got, want)
+		}
+	}
+}
+
+// MLDMulticastAddress creates a checker that checks the Multicast Address
+// field of a MLD message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid MLD message as far as the size is concerned.
+func MLDMulticastAddress(want tcpip.Address) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		ns := header.MLD(icmp.MessageBody())
+
+		if got := ns.MulticastAddress(); got != want {
+			t.Errorf("got %T.MulticastAddress() = %s, want = %s", ns, got, want)
+		}
+	}
+}
+
 // NDP creates a checker that checks that the packet contains a valid NDP
 // message for type of ty, with potentially additional checks specified by
 // checkers.
@@ -1031,7 +1136,7 @@ func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) N
 		last := h[len(h)-1]
 
 		icmp := header.ICMPv6(last.Payload())
-		if got := len(icmp.NDPPayload()); got < minSize {
+		if got := len(icmp.MessageBody()); got < minSize {
 			t.Fatalf("ICMPv6 NDP (type = %d) payload size of %d is less than the minimum size of %d", msgType, got, minSize)
 		}
 
@@ -1065,7 +1170,7 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+		ns := header.NDPNeighborSolicit(icmp.MessageBody())
 
 		if got := ns.TargetAddress(); got != want {
 			t.Errorf("got %T.TargetAddress() = %s, want = %s", ns, got, want)
@@ -1094,7 +1199,7 @@ func NDPNATargetAddress(want tcpip.Address) TransportChecker {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+		na := header.NDPNeighborAdvert(icmp.MessageBody())
 
 		if got := na.TargetAddress(); got != want {
 			t.Errorf("got %T.TargetAddress() = %s, want = %s", na, got, want)
@@ -1112,7 +1217,7 @@ func NDPNASolicitedFlag(want bool) TransportChecker {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+		na := header.NDPNeighborAdvert(icmp.MessageBody())
 
 		if got := na.SolicitedFlag(); got != want {
 			t.Errorf("got %T.SolicitedFlag = %t, want = %t", na, got, want)
@@ -1183,7 +1288,7 @@ func NDPNAOptions(opts []header.NDPOption) TransportChecker {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+		na := header.NDPNeighborAdvert(icmp.MessageBody())
 		ndpOptions(t, na.Options(), opts)
 	}
 }
@@ -1198,7 +1303,7 @@ func NDPNSOptions(opts []header.NDPOption) TransportChecker {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+		ns := header.NDPNeighborSolicit(icmp.MessageBody())
 		ndpOptions(t, ns.Options(), opts)
 	}
 }
@@ -1223,7 +1328,75 @@ func NDPRSOptions(opts []header.NDPOption) TransportChecker {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		rs := header.NDPRouterSolicit(icmp.NDPPayload())
+		rs := header.NDPRouterSolicit(icmp.MessageBody())
 		ndpOptions(t, rs.Options(), opts)
 	}
 }
+
+// IGMP checks the validity and properties of the given IGMP packet. It is
+// expected to be used in conjunction with other IGMP transport checkers for
+// specific properties.
+func IGMP(checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		last := h[len(h)-1]
+
+		if p := last.TransportProtocol(); p != header.IGMPProtocolNumber {
+			t.Fatalf("Bad protocol, got %d, want %d", p, header.IGMPProtocolNumber)
+		}
+
+		igmp := header.IGMP(last.Payload())
+		for _, f := range checkers {
+			f(t, igmp)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// IGMPType creates a checker that checks the IGMP Type field.
+func IGMPType(want header.IGMPType) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		igmp, ok := h.(header.IGMP)
+		if !ok {
+			t.Fatalf("got transport header = %T, want = header.IGMP", h)
+		}
+		if got := igmp.Type(); got != want {
+			t.Errorf("got igmp.Type() = %d, want = %d", got, want)
+		}
+	}
+}
+
+// IGMPMaxRespTime creates a checker that checks the IGMP Max Resp Time field.
+func IGMPMaxRespTime(want time.Duration) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		igmp, ok := h.(header.IGMP)
+		if !ok {
+			t.Fatalf("got transport header = %T, want = header.IGMP", h)
+		}
+		if got := igmp.MaxRespTime(); got != want {
+			t.Errorf("got igmp.MaxRespTime() = %s, want = %s", got, want)
+		}
+	}
+}
+
+// IGMPGroupAddress creates a checker that checks the IGMP Group Address field.
+func IGMPGroupAddress(want tcpip.Address) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		igmp, ok := h.(header.IGMP)
+		if !ok {
+			t.Fatalf("got transport header = %T, want = header.IGMP", h)
+		}
+		if got := igmp.GroupAddress(); got != want {
+			t.Errorf("got igmp.GroupAddress() = %s, want = %s", got, want)
+		}
+	}
+}
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index d87797617..0bdc12d53 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -11,11 +11,13 @@ go_library(
         "gue.go",
         "icmpv4.go",
         "icmpv6.go",
+        "igmp.go",
         "interfaces.go",
         "ipv4.go",
         "ipv6.go",
         "ipv6_extension_headers.go",
         "ipv6_fragment.go",
+        "mld.go",
         "ndp_neighbor_advert.go",
         "ndp_neighbor_solicit.go",
         "ndp_options.go",
@@ -39,6 +41,8 @@ go_test(
     size = "small",
     srcs = [
         "checksum_test.go",
+        "igmp_test.go",
+        "ipv4_test.go",
         "ipv6_test.go",
         "ipversion_test.go",
         "tcp_test.go",
@@ -58,6 +62,7 @@ go_test(
     srcs = [
         "eth_test.go",
         "ipv6_extension_headers_test.go",
+        "mld_test.go",
         "ndp_test.go",
     ],
     library = ":header",
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index 4303fc5d5..2eef64b4d 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -115,6 +115,12 @@ const (
 	ICMPv6NeighborSolicit ICMPv6Type = 135
 	ICMPv6NeighborAdvert  ICMPv6Type = 136
 	ICMPv6RedirectMsg     ICMPv6Type = 137
+
+	// Multicast Listener Discovery (MLD) messages, see RFC 2710.
+
+	ICMPv6MulticastListenerQuery  ICMPv6Type = 130
+	ICMPv6MulticastListenerReport ICMPv6Type = 131
+	ICMPv6MulticastListenerDone   ICMPv6Type = 132
 )
 
 // IsErrorType returns true if the receiver is an ICMP error type.
@@ -245,10 +251,9 @@ func (b ICMPv6) SetSequence(sequence uint16) {
 	binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence)
 }
 
-// NDPPayload returns the NDP payload buffer. That is, it returns the ICMPv6
-// packet's message body as defined by RFC 4443 section 2.1; the portion of the
-// ICMPv6 buffer after the first ICMPv6HeaderSize bytes.
-func (b ICMPv6) NDPPayload() []byte {
+// MessageBody returns the message body as defined by RFC 4443 section 2.1; the
+// portion of the ICMPv6 buffer after the first ICMPv6HeaderSize bytes.
+func (b ICMPv6) MessageBody() []byte {
 	return b[ICMPv6HeaderSize:]
 }
 
diff --git a/pkg/tcpip/header/igmp.go b/pkg/tcpip/header/igmp.go
new file mode 100644
index 000000000..5c5be1b9d
--- /dev/null
+++ b/pkg/tcpip/header/igmp.go
@@ -0,0 +1,181 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// IGMP represents an IGMP header stored in a byte array.
+type IGMP []byte
+
+// IGMP implements `Transport`.
+var _ Transport = (*IGMP)(nil)
+
+const (
+	// IGMPMinimumSize is the minimum size of a valid IGMP packet in bytes,
+	// as per RFC 2236, Section 2, Page 2.
+	IGMPMinimumSize = 8
+
+	// IGMPQueryMinimumSize is the minimum size of a valid Membership Query
+	// Message in bytes, as per RFC 2236, Section 2, Page 2.
+	IGMPQueryMinimumSize = 8
+
+	// IGMPReportMinimumSize is the minimum size of a valid Report Message in
+	// bytes, as per RFC 2236, Section 2, Page 2.
+	IGMPReportMinimumSize = 8
+
+	// IGMPLeaveMessageMinimumSize is the minimum size of a valid Leave Message
+	// in bytes, as per RFC 2236, Section 2, Page 2.
+	IGMPLeaveMessageMinimumSize = 8
+
+	// IGMPTTL is the TTL for all IGMP messages, as per RFC 2236, Section 3, Page
+	// 3.
+	IGMPTTL = 1
+
+	// igmpTypeOffset defines the offset of the type field in an IGMP message.
+	igmpTypeOffset = 0
+
+	// igmpMaxRespTimeOffset defines the offset of the MaxRespTime field in an
+	// IGMP message.
+	igmpMaxRespTimeOffset = 1
+
+	// igmpChecksumOffset defines the offset of the checksum field in an IGMP
+	// message.
+	igmpChecksumOffset = 2
+
+	// igmpGroupAddressOffset defines the offset of the Group Address field in an
+	// IGMP message.
+	igmpGroupAddressOffset = 4
+
+	// IGMPProtocolNumber is IGMP's transport protocol number.
+	IGMPProtocolNumber tcpip.TransportProtocolNumber = 2
+)
+
+// IGMPType is the IGMP type field as per RFC 2236.
+type IGMPType byte
+
+// Values for the IGMP Type described in RFC 2236 Section 2.1, Page 2.
+// Descriptions below come from there.
+const (
+	// IGMPMembershipQuery indicates that the message type is Membership Query.
+	// "There are two sub-types of Membership Query messages:
+	// - General Query, used to learn which groups have members on an
+	//   attached network.
+	// - Group-Specific Query, used to learn if a particular group
+	//   has any members on an attached network.
+	// These two messages are differentiated by the Group Address, as
+	// described in section 1.4 ."
+	IGMPMembershipQuery IGMPType = 0x11
+	// IGMPv1MembershipReport indicates that the message is a Membership Report
+	// generated by a host using the IGMPv1 protocol: "an additional type of
+	// message, for backwards-compatibility with IGMPv1"
+	IGMPv1MembershipReport IGMPType = 0x12
+	// IGMPv2MembershipReport indicates that the Message type is a Membership
+	// Report generated by a host using the IGMPv2 protocol.
+	IGMPv2MembershipReport IGMPType = 0x16
+	// IGMPLeaveGroup indicates that the message type is a Leave Group
+	// notification message.
+	IGMPLeaveGroup IGMPType = 0x17
+)
+
+// Type is the IGMP type field.
+func (b IGMP) Type() IGMPType { return IGMPType(b[igmpTypeOffset]) }
+
+// SetType sets the IGMP type field.
+func (b IGMP) SetType(t IGMPType) { b[igmpTypeOffset] = byte(t) }
+
+// MaxRespTime gets the MaxRespTimeField. This is meaningful only in Membership
+// Query messages, in other cases it is set to 0 by the sender and ignored by
+// the receiver.
+func (b IGMP) MaxRespTime() time.Duration {
+	// As per RFC 2236 section 2.2,
+	//
+	//  The Max Response Time field is meaningful only in Membership Query
+	//  messages, and specifies the maximum allowed time before sending a
+	//  responding report in units of 1/10 second.  In all other messages, it
+	//  is set to zero by the sender and ignored by receivers.
+	return DecisecondToDuration(b[igmpMaxRespTimeOffset])
+}
+
+// SetMaxRespTime sets the MaxRespTimeField.
+func (b IGMP) SetMaxRespTime(m byte) { b[igmpMaxRespTimeOffset] = m }
+
+// Checksum is the IGMP checksum field.
+func (b IGMP) Checksum() uint16 {
+	return binary.BigEndian.Uint16(b[igmpChecksumOffset:])
+}
+
+// SetChecksum sets the IGMP checksum field.
+func (b IGMP) SetChecksum(checksum uint16) {
+	binary.BigEndian.PutUint16(b[igmpChecksumOffset:], checksum)
+}
+
+// GroupAddress gets the Group Address field.
+func (b IGMP) GroupAddress() tcpip.Address {
+	return tcpip.Address(b[igmpGroupAddressOffset:][:IPv4AddressSize])
+}
+
+// SetGroupAddress sets the Group Address field.
+func (b IGMP) SetGroupAddress(address tcpip.Address) {
+	if n := copy(b[igmpGroupAddressOffset:], address); n != IPv4AddressSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, IPv4AddressSize))
+	}
+}
+
+// SourcePort implements Transport.SourcePort.
+func (IGMP) SourcePort() uint16 {
+	return 0
+}
+
+// DestinationPort implements Transport.DestinationPort.
+func (IGMP) DestinationPort() uint16 {
+	return 0
+}
+
+// SetSourcePort implements Transport.SetSourcePort.
+func (IGMP) SetSourcePort(uint16) {
+}
+
+// SetDestinationPort implements Transport.SetDestinationPort.
+func (IGMP) SetDestinationPort(uint16) {
+}
+
+// Payload implements Transport.Payload.
+func (IGMP) Payload() []byte {
+	return nil
+}
+
+// IGMPCalculateChecksum calculates the IGMP checksum over the provided IGMP
+// header.
+func IGMPCalculateChecksum(h IGMP) uint16 {
+	// The header contains a checksum itself, set it aside to avoid checksumming
+	// the checksum and replace it afterwards.
+	existingXsum := h.Checksum()
+	h.SetChecksum(0)
+	xsum := ^Checksum(h, 0)
+	h.SetChecksum(existingXsum)
+	return xsum
+}
+
+// DecisecondToDuration converts a value representing deci-seconds to a
+// time.Duration.
+func DecisecondToDuration(ds uint8) time.Duration {
+	return time.Duration(ds) * time.Second / 10
+}
diff --git a/pkg/tcpip/header/igmp_test.go b/pkg/tcpip/header/igmp_test.go
new file mode 100644
index 000000000..b6126d29a
--- /dev/null
+++ b/pkg/tcpip/header/igmp_test.go
@@ -0,0 +1,110 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// TestIGMPHeader tests the functions within header.igmp
+func TestIGMPHeader(t *testing.T) {
+	const maxRespTimeTenthSec = 0xF0
+	b := []byte{
+		0x11,                // IGMP Type, Membership Query
+		maxRespTimeTenthSec, // Maximum Response Time
+		0xC0, 0xC0,          // Checksum
+		0x01, 0x02, 0x03, 0x04, // Group Address
+	}
+
+	igmpHeader := header.IGMP(b)
+
+	if got, want := igmpHeader.Type(), header.IGMPMembershipQuery; got != want {
+		t.Errorf("got igmpHeader.Type() = %x, want = %x", got, want)
+	}
+
+	if got, want := igmpHeader.MaxRespTime(), header.DecisecondToDuration(maxRespTimeTenthSec); got != want {
+		t.Errorf("got igmpHeader.MaxRespTime() = %s, want = %s", got, want)
+	}
+
+	if got, want := igmpHeader.Checksum(), uint16(0xC0C0); got != want {
+		t.Errorf("got igmpHeader.Checksum() = %x, want = %x", got, want)
+	}
+
+	if got, want := igmpHeader.GroupAddress(), tcpip.Address("\x01\x02\x03\x04"); got != want {
+		t.Errorf("got igmpHeader.GroupAddress() = %s, want = %s", got, want)
+	}
+
+	igmpType := header.IGMPv2MembershipReport
+	igmpHeader.SetType(igmpType)
+	if got := igmpHeader.Type(); got != igmpType {
+		t.Errorf("got igmpHeader.Type() = %x, want = %x", got, igmpType)
+	}
+	if got := header.IGMPType(b[0]); got != igmpType {
+		t.Errorf("got IGMPtype in backing buffer = %x, want %x", got, igmpType)
+	}
+
+	respTime := byte(0x02)
+	igmpHeader.SetMaxRespTime(respTime)
+	if got, want := igmpHeader.MaxRespTime(), header.DecisecondToDuration(respTime); got != want {
+		t.Errorf("got igmpHeader.MaxRespTime() = %s, want = %s", got, want)
+	}
+
+	checksum := uint16(0x0102)
+	igmpHeader.SetChecksum(checksum)
+	if got := igmpHeader.Checksum(); got != checksum {
+		t.Errorf("got igmpHeader.Checksum() = %x, want = %x", got, checksum)
+	}
+
+	groupAddress := tcpip.Address("\x04\x03\x02\x01")
+	igmpHeader.SetGroupAddress(groupAddress)
+	if got := igmpHeader.GroupAddress(); got != groupAddress {
+		t.Errorf("got igmpHeader.GroupAddress() = %s, want = %s", got, groupAddress)
+	}
+}
+
+// TestIGMPChecksum ensures that the checksum calculator produces the expected
+// checksum.
+func TestIGMPChecksum(t *testing.T) {
+	b := []byte{
+		0x11,       // IGMP Type, Membership Query
+		0xF0,       // Maximum Response Time
+		0xC0, 0xC0, // Checksum
+		0x01, 0x02, 0x03, 0x04, // Group Address
+	}
+
+	igmpHeader := header.IGMP(b)
+
+	// Calculate the initial checksum after setting the checksum temporarily to 0
+	// to avoid checksumming the checksum.
+	initialChecksum := igmpHeader.Checksum()
+	igmpHeader.SetChecksum(0)
+	checksum := ^header.Checksum(b, 0)
+	igmpHeader.SetChecksum(initialChecksum)
+
+	if got := header.IGMPCalculateChecksum(igmpHeader); got != checksum {
+		t.Errorf("got IGMPCalculateChecksum = %x, want %x", got, checksum)
+	}
+}
+
+func TestDecisecondToDuration(t *testing.T) {
+	const valueInDeciseconds = 5
+	if got, want := header.DecisecondToDuration(valueInDeciseconds), valueInDeciseconds*time.Second/10; got != want {
+		t.Fatalf("got header.DecisecondToDuration(%d) = %s, want = %s", valueInDeciseconds, got, want)
+	}
+}
diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
index 91fe7b6a5..e6103f4bc 100644
--- a/pkg/tcpip/header/ipv4.go
+++ b/pkg/tcpip/header/ipv4.go
@@ -100,7 +100,7 @@ type IPv4Fields struct {
 	//
 	// That leaves ten 32 bit (4 byte) fields for options. An attempt to encode
 	// more will fail.
-	Options IPv4Options
+	Options IPv4OptionsSerializer
 }
 
 // IPv4 is an IPv4 header.
@@ -157,6 +157,9 @@ const (
 	// IPv4Any is the non-routable IPv4 "any" meta address.
 	IPv4Any tcpip.Address = "\x00\x00\x00\x00"
 
+	// IPv4AllRoutersGroup is a multicast address for all routers.
+	IPv4AllRoutersGroup tcpip.Address = "\xe0\x00\x00\x02"
+
 	// IPv4MinimumProcessableDatagramSize is the minimum size of an IP
 	// packet that every IPv4 capable host must be able to
 	// process/reassemble.
@@ -282,18 +285,17 @@ func (b IPv4) DestinationAddress() tcpip.Address {
 	return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize])
 }
 
-// IPv4Options is a buffer that holds all the raw IP options.
-type IPv4Options []byte
-
-// SizeWithPadding implements stack.NetOptions.
-// It reports the size to allocate for the Options. RFC 791 page 23 (end of
-// section 3.1) says of the padding at the end of the options:
+// padIPv4OptionsLength returns the total length for IPv4 options of length l
+// after applying padding according to RFC 791:
 //    The internet header padding is used to ensure that the internet
 //    header ends on a 32 bit boundary.
-func (o IPv4Options) SizeWithPadding() int {
-	return (len(o) + IPv4IHLStride - 1) & ^(IPv4IHLStride - 1)
+func padIPv4OptionsLength(length uint8) uint8 {
+	return (length + IPv4IHLStride - 1) & ^uint8(IPv4IHLStride-1)
 }
 
+// IPv4Options is a buffer that holds all the raw IP options.
+type IPv4Options []byte
+
 // Options returns a buffer holding the options.
 func (b IPv4) Options() IPv4Options {
 	hdrLen := b.HeaderLength()
@@ -372,26 +374,16 @@ func (b IPv4) CalculateChecksum() uint16 {
 func (b IPv4) Encode(i *IPv4Fields) {
 	// The size of the options defines the size of the whole header and thus the
 	// IHL field. Options are rare and this is a heavily used function so it is
-	// worth a bit of optimisation here to keep the copy out of the fast path.
-	hdrLen := IPv4MinimumSize
+	// worth a bit of optimisation here to keep the serializer out of the fast
+	// path.
+	hdrLen := uint8(IPv4MinimumSize)
 	if len(i.Options) != 0 {
-		// SizeWithPadding is always >= len(i.Options).
-		aLen := i.Options.SizeWithPadding()
-		hdrLen += aLen
-		if hdrLen > len(b) {
-			panic(fmt.Sprintf("encode received %d bytes, wanted >= %d", len(b), hdrLen))
-		}
-		opts := b[options:]
-		// This avoids bounds checks on the next line(s) which would happen even
-		// if there's no work to do.
-		if n := copy(opts, i.Options); n != aLen {
-			padding := opts[n:][:aLen-n]
-			for i := range padding {
-				padding[i] = 0
-			}
-		}
+		hdrLen += i.Options.Serialize(b[options:])
+	}
+	if hdrLen > IPv4MaximumHeaderSize {
+		panic(fmt.Sprintf("%d is larger than maximum IPv4 header size of %d", hdrLen, IPv4MaximumHeaderSize))
 	}
-	b.SetHeaderLength(uint8(hdrLen))
+	b.SetHeaderLength(hdrLen)
 	b[tos] = i.TOS
 	b.SetTotalLength(i.TotalLength)
 	binary.BigEndian.PutUint16(b[id:], i.ID)
@@ -471,6 +463,10 @@ const (
 	// options and may appear multiple times.
 	IPv4OptionNOPType IPv4OptionType = 1
 
+	// IPv4OptionRouterAlertType is the option type for the Router Alert option,
+	// defined in RFC 2113 Section 2.1.
+	IPv4OptionRouterAlertType IPv4OptionType = 20 | 0x80
+
 	// IPv4OptionRecordRouteType is used by each router on the path of the packet
 	// to record its path. It is carried over to an Echo Reply.
 	IPv4OptionRecordRouteType IPv4OptionType = 7
@@ -871,3 +867,162 @@ func (rr *IPv4OptionRecordRoute) Size() uint8 { return uint8(len(*rr)) }
 
 // Contents implements IPv4Option.
 func (rr *IPv4OptionRecordRoute) Contents() []byte { return []byte(*rr) }
+
+// Router Alert option specific related constants.
+//
+// from RFC 2113 section 2.1:
+//
+//     +--------+--------+--------+--------+
+//     |10010100|00000100|  2 octet value  |
+//     +--------+--------+--------+--------+
+//
+//     Type:
+//     Copied flag:  1 (all fragments must carry the option)
+//     Option class: 0 (control)
+//     Option number: 20 (decimal)
+//
+//     Length: 4
+//
+//     Value:  A two octet code with the following values:
+//     0 - Router shall examine packet
+//     1-65535 - Reserved
+const (
+	// IPv4OptionRouterAlertLength is the length of a Router Alert option.
+	IPv4OptionRouterAlertLength = 4
+
+	// IPv4OptionRouterAlertValue is the only permissible value of the 16 bit
+	// payload of the router alert option.
+	IPv4OptionRouterAlertValue = 0
+
+	// iPv4OptionRouterAlertValueOffset is the offset for the value of a
+	// RouterAlert option.
+	iPv4OptionRouterAlertValueOffset = 2
+)
+
+// IPv4SerializableOption is an interface to represent serializable IPv4 option
+// types.
+type IPv4SerializableOption interface {
+	// optionType returns the type identifier of the option.
+	optionType() IPv4OptionType
+}
+
+// IPv4SerializableOptionPayload is an interface providing serialization of the
+// payload of an IPv4 option.
+type IPv4SerializableOptionPayload interface {
+	// length returns the size of the payload.
+	length() uint8
+
+	// serializeInto serializes the payload into the provided byte buffer.
+	//
+	// Note, the caller MUST provide a byte buffer with size of at least
+	// Length. Implementers of this function may assume that the byte buffer
+	// is of sufficient size. serializeInto MUST panic if the provided byte
+	// buffer is not of sufficient size.
+	//
+	// serializeInto will return the number of bytes that was used to
+	// serialize the receiver. Implementers must only use the number of
+	// bytes required to serialize the receiver. Callers MAY provide a
+	// larger buffer than required to serialize into.
+	serializeInto(buffer []byte) uint8
+}
+
+// IPv4OptionsSerializer is a serializer for IPv4 options.
+type IPv4OptionsSerializer []IPv4SerializableOption
+
+// Length returns the total number of bytes required to serialize the options.
+func (s IPv4OptionsSerializer) Length() uint8 {
+	var total uint8
+	for _, opt := range s {
+		total++
+		if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok {
+			// Add 1 to reported length to account for the length byte.
+			total += 1 + withPayload.length()
+		}
+	}
+	return padIPv4OptionsLength(total)
+}
+
+// Serialize serializes the provided list of IPV4 options into b.
+//
+// Note, b must be of sufficient size to hold all the options in s. See
+// IPv4OptionsSerializer.Length for details on the getting the total size
+// of a serialized IPv4OptionsSerializer.
+//
+// Serialize panics if b is not of sufficient size to hold all the options in s.
+func (s IPv4OptionsSerializer) Serialize(b []byte) uint8 {
+	var total uint8
+	for _, opt := range s {
+		ty := opt.optionType()
+		if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok {
+			// Serialize first to reduce bounds checks.
+			l := 2 + withPayload.serializeInto(b[2:])
+			b[0] = byte(ty)
+			b[1] = l
+			b = b[l:]
+			total += l
+			continue
+		}
+		// Options without payload consist only of the type field.
+		//
+		// NB: Repeating code from the branch above is intentional to minimize
+		// bounds checks.
+		b[0] = byte(ty)
+		b = b[1:]
+		total++
+	}
+
+	// According to RFC 791:
+	//
+	//  The internet header padding is used to ensure that the internet
+	//  header ends on a 32 bit boundary. The padding is zero.
+	padded := padIPv4OptionsLength(total)
+	b = b[:padded-total]
+	for i := range b {
+		b[i] = 0
+	}
+	return padded
+}
+
+var _ IPv4SerializableOptionPayload = (*IPv4SerializableRouterAlertOption)(nil)
+var _ IPv4SerializableOption = (*IPv4SerializableRouterAlertOption)(nil)
+
+// IPv4SerializableRouterAlertOption provides serialization of the Router Alert
+// IPv4 option according to RFC 2113.
+type IPv4SerializableRouterAlertOption struct{}
+
+// Type implements IPv4SerializableOption.
+func (*IPv4SerializableRouterAlertOption) optionType() IPv4OptionType {
+	return IPv4OptionRouterAlertType
+}
+
+// Length implements IPv4SerializableOption.
+func (*IPv4SerializableRouterAlertOption) length() uint8 {
+	return IPv4OptionRouterAlertLength - iPv4OptionRouterAlertValueOffset
+}
+
+// SerializeInto implements IPv4SerializableOption.
+func (o *IPv4SerializableRouterAlertOption) serializeInto(buffer []byte) uint8 {
+	binary.BigEndian.PutUint16(buffer, IPv4OptionRouterAlertValue)
+	return o.length()
+}
+
+var _ IPv4SerializableOption = (*IPv4SerializableNOPOption)(nil)
+
+// IPv4SerializableNOPOption provides serialization for the IPv4 no-op option.
+type IPv4SerializableNOPOption struct{}
+
+// Type implements IPv4SerializableOption.
+func (*IPv4SerializableNOPOption) optionType() IPv4OptionType {
+	return IPv4OptionNOPType
+}
+
+var _ IPv4SerializableOption = (*IPv4SerializableListEndOption)(nil)
+
+// IPv4SerializableListEndOption provides serialization for the IPv4 List End
+// option.
+type IPv4SerializableListEndOption struct{}
+
+// Type implements IPv4SerializableOption.
+func (*IPv4SerializableListEndOption) optionType() IPv4OptionType {
+	return IPv4OptionListEndType
+}
diff --git a/pkg/tcpip/header/ipv4_test.go b/pkg/tcpip/header/ipv4_test.go
new file mode 100644
index 000000000..6475cd694
--- /dev/null
+++ b/pkg/tcpip/header/ipv4_test.go
@@ -0,0 +1,179 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestIPv4OptionsSerializer(t *testing.T) {
+	optCases := []struct {
+		name   string
+		option []header.IPv4SerializableOption
+		expect []byte
+	}{
+		{
+			name: "NOP",
+			option: []header.IPv4SerializableOption{
+				&header.IPv4SerializableNOPOption{},
+			},
+			expect: []byte{1, 0, 0, 0},
+		},
+		{
+			name: "ListEnd",
+			option: []header.IPv4SerializableOption{
+				&header.IPv4SerializableListEndOption{},
+			},
+			expect: []byte{0, 0, 0, 0},
+		},
+		{
+			name: "RouterAlert",
+			option: []header.IPv4SerializableOption{
+				&header.IPv4SerializableRouterAlertOption{},
+			},
+			expect: []byte{148, 4, 0, 0},
+		}, {
+			name: "NOP and RouterAlert",
+			option: []header.IPv4SerializableOption{
+				&header.IPv4SerializableNOPOption{},
+				&header.IPv4SerializableRouterAlertOption{},
+			},
+			expect: []byte{1, 148, 4, 0, 0, 0, 0, 0},
+		},
+	}
+
+	for _, opt := range optCases {
+		t.Run(opt.name, func(t *testing.T) {
+			s := header.IPv4OptionsSerializer(opt.option)
+			l := s.Length()
+			if got := len(opt.expect); got != int(l) {
+				t.Fatalf("s.Length() = %d, want = %d", got, l)
+			}
+			b := make([]byte, l)
+			for i := range b {
+				// Fill the buffer with full bytes to ensure padding is being set
+				// correctly.
+				b[i] = 0xFF
+			}
+			if serializedLength := s.Serialize(b); serializedLength != l {
+				t.Fatalf("s.Serialize(_) = %d, want %d", serializedLength, l)
+			}
+			if diff := cmp.Diff(opt.expect, b); diff != "" {
+				t.Errorf("mismatched serialized option (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+// TestIPv4Encode checks that ipv4.Encode correctly fills out the requested
+// fields when options are supplied.
+func TestIPv4EncodeOptions(t *testing.T) {
+	tests := []struct {
+		name           string
+		numberOfNops   int
+		encodedOptions header.IPv4Options // reply should look like this
+		wantIHL        int
+	}{
+		{
+			name:    "valid no options",
+			wantIHL: header.IPv4MinimumSize,
+		},
+		{
+			name:           "one byte options",
+			numberOfNops:   1,
+			encodedOptions: header.IPv4Options{1, 0, 0, 0},
+			wantIHL:        header.IPv4MinimumSize + 4,
+		},
+		{
+			name:           "two byte options",
+			numberOfNops:   2,
+			encodedOptions: header.IPv4Options{1, 1, 0, 0},
+			wantIHL:        header.IPv4MinimumSize + 4,
+		},
+		{
+			name:           "three byte options",
+			numberOfNops:   3,
+			encodedOptions: header.IPv4Options{1, 1, 1, 0},
+			wantIHL:        header.IPv4MinimumSize + 4,
+		},
+		{
+			name:           "four byte options",
+			numberOfNops:   4,
+			encodedOptions: header.IPv4Options{1, 1, 1, 1},
+			wantIHL:        header.IPv4MinimumSize + 4,
+		},
+		{
+			name:           "five byte options",
+			numberOfNops:   5,
+			encodedOptions: header.IPv4Options{1, 1, 1, 1, 1, 0, 0, 0},
+			wantIHL:        header.IPv4MinimumSize + 8,
+		},
+		{
+			name:         "thirty nine byte options",
+			numberOfNops: 39,
+			encodedOptions: header.IPv4Options{
+				1, 1, 1, 1, 1, 1, 1, 1,
+				1, 1, 1, 1, 1, 1, 1, 1,
+				1, 1, 1, 1, 1, 1, 1, 1,
+				1, 1, 1, 1, 1, 1, 1, 1,
+				1, 1, 1, 1, 1, 1, 1, 0,
+			},
+			wantIHL: header.IPv4MinimumSize + 40,
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			serializeOpts := header.IPv4OptionsSerializer(make([]header.IPv4SerializableOption, test.numberOfNops))
+			for i := range serializeOpts {
+				serializeOpts[i] = &header.IPv4SerializableNOPOption{}
+			}
+			paddedOptionLength := serializeOpts.Length()
+			ipHeaderLength := int(header.IPv4MinimumSize + paddedOptionLength)
+			if ipHeaderLength > header.IPv4MaximumHeaderSize {
+				t.Fatalf("IP header length too large: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize)
+			}
+			totalLen := uint16(ipHeaderLength)
+			hdr := buffer.NewPrependable(int(totalLen))
+			ip := header.IPv4(hdr.Prepend(ipHeaderLength))
+			// To check the padding works, poison the last byte of the options space.
+			if paddedOptionLength != serializeOpts.Length() {
+				ip.SetHeaderLength(uint8(ipHeaderLength))
+				ip.Options()[paddedOptionLength-1] = 0xff
+				ip.SetHeaderLength(0)
+			}
+			ip.Encode(&header.IPv4Fields{
+				Options: serializeOpts,
+			})
+			options := ip.Options()
+			wantOptions := test.encodedOptions
+			if got, want := int(ip.HeaderLength()), test.wantIHL; got != want {
+				t.Errorf("got IHL of %d, want %d", got, want)
+			}
+
+			// cmp.Diff does not consider nil slices equal to empty slices, but we do.
+			if len(wantOptions) == 0 && len(options) == 0 {
+				return
+			}
+
+			if diff := cmp.Diff(wantOptions, options); diff != "" {
+				t.Errorf("options mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
index 583c2c5d3..571eae233 100644
--- a/pkg/tcpip/header/ipv6_extension_headers.go
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -47,6 +47,11 @@ const (
 	// IPv6NoNextHeaderIdentifier is the header identifier used to signify the end
 	// of an IPv6 payload, as per RFC 8200 section 4.7.
 	IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59
+
+	// IPv6UnknownExtHdrIdentifier is reserved by IANA.
+	// https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#extension-header
+	// "254	Use for experimentation and testing	[RFC3692][RFC4727]"
+	IPv6UnknownExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 254
 )
 
 const (
@@ -452,9 +457,11 @@ func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader {
 		// Since we consume the iterator, we return the payload as is.
 		buf = i.payload
 
-		// Mark i as done.
+		// Mark i as done, but keep track of where we were for error reporting.
 		*i = IPv6PayloadIterator{
 			nextHdrIdentifier: IPv6NoNextHeaderIdentifier,
+			headerOffset:      i.headerOffset,
+			nextOffset:        i.nextOffset,
 		}
 	} else {
 		buf = i.payload.Clone(nil)
diff --git a/pkg/tcpip/header/mld.go b/pkg/tcpip/header/mld.go
new file mode 100644
index 000000000..ffe03c76a
--- /dev/null
+++ b/pkg/tcpip/header/mld.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// MLDMinimumSize is the minimum size for an MLD message.
+	MLDMinimumSize = 20
+
+	// MLDHopLimit is the Hop Limit for all IPv6 packets with an MLD message, as
+	// per RFC 2710 section 3.
+	MLDHopLimit = 1
+
+	// mldMaximumResponseDelayOffset is the offset to the Maximum Response Delay
+	// field within MLD.
+	mldMaximumResponseDelayOffset = 0
+
+	// mldMulticastAddressOffset is the offset to the Multicast Address field
+	// within MLD.
+	mldMulticastAddressOffset = 4
+)
+
+// MLD is a Multicast Listener Discovery message in an ICMPv6 packet.
+//
+// MLD will only contain the body of an ICMPv6 packet.
+//
+// As per RFC 2710 section 3, MLD messages have the following format (MLD only
+// holds the bytes after the first four bytes in the diagram below):
+//
+//    0                   1                   2                   3
+//    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |     Type      |     Code      |          Checksum             |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |     Maximum Response Delay    |          Reserved             |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |                                                               |
+//   +                                                               +
+//   |                                                               |
+//   +                       Multicast Address                       +
+//   |                                                               |
+//   +                                                               +
+//   |                                                               |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+type MLD []byte
+
+// MaximumResponseDelay returns the Maximum Response Delay.
+func (m MLD) MaximumResponseDelay() time.Duration {
+	// As per RFC 2710 section 3.4:
+	//
+	//   The Maximum Response Delay field is meaningful only in Query
+	//   messages, and specifies the maximum allowed delay before sending a
+	//   responding Report, in units of milliseconds. In all other messages,
+	//   it is set to zero by the sender and ignored by receivers.
+	return time.Duration(binary.BigEndian.Uint16(m[mldMaximumResponseDelayOffset:])) * time.Millisecond
+}
+
+// SetMaximumResponseDelay sets the Maximum Response Delay field.
+//
+// maxRespDelayMS is the value in milliseconds.
+func (m MLD) SetMaximumResponseDelay(maxRespDelayMS uint16) {
+	binary.BigEndian.PutUint16(m[mldMaximumResponseDelayOffset:], maxRespDelayMS)
+}
+
+// MulticastAddress returns the Multicast Address.
+func (m MLD) MulticastAddress() tcpip.Address {
+	// As per RFC 2710 section 3.5:
+	//
+	//   In a Query message, the Multicast Address field is set to zero when
+	//   sending a General Query, and set to a specific IPv6 multicast address
+	//   when sending a Multicast-Address-Specific Query.
+	//
+	//   In a Report or Done message, the Multicast Address field holds a
+	//   specific IPv6 multicast address to which the message sender is
+	//   listening or is ceasing to listen, respectively.
+	return tcpip.Address(m[mldMulticastAddressOffset:][:IPv6AddressSize])
+}
+
+// SetMulticastAddress sets the Multicast Address field.
+func (m MLD) SetMulticastAddress(multicastAddress tcpip.Address) {
+	if n := copy(m[mldMulticastAddressOffset:], multicastAddress); n != IPv6AddressSize {
+		panic(fmt.Sprintf("copied %d bytes, expected to copy %d bytes", n, IPv6AddressSize))
+	}
+}
diff --git a/pkg/tcpip/header/mld_test.go b/pkg/tcpip/header/mld_test.go
new file mode 100644
index 000000000..0cecf10d4
--- /dev/null
+++ b/pkg/tcpip/header/mld_test.go
@@ -0,0 +1,61 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+func TestMLD(t *testing.T) {
+	b := []byte{
+		// Maximum Response Delay
+		0, 0,
+
+		// Reserved
+		0, 0,
+
+		// MulticastAddress
+		1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6,
+	}
+
+	const maxRespDelay = 513
+	binary.BigEndian.PutUint16(b, maxRespDelay)
+
+	mld := MLD(b)
+
+	if got, want := mld.MaximumResponseDelay(), maxRespDelay*time.Millisecond; got != want {
+		t.Errorf("got mld.MaximumResponseDelay() = %s, want = %s", got, want)
+	}
+
+	const newMaxRespDelay = 1234
+	mld.SetMaximumResponseDelay(newMaxRespDelay)
+	if got, want := mld.MaximumResponseDelay(), newMaxRespDelay*time.Millisecond; got != want {
+		t.Errorf("got mld.MaximumResponseDelay() = %s, want = %s", got, want)
+	}
+
+	if got, want := mld.MulticastAddress(), tcpip.Address([]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6}); got != want {
+		t.Errorf("got mld.MulticastAddress() = %s, want = %s", got, want)
+	}
+
+	multicastAddress := tcpip.Address([]byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0})
+	mld.SetMulticastAddress(multicastAddress)
+	if got := mld.MulticastAddress(); got != multicastAddress {
+		t.Errorf("got mld.MulticastAddress() = %s, want = %s", got, multicastAddress)
+	}
+}
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 5d3975c56..554242f0c 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -298,7 +298,7 @@ func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) {
 	return it, nil
 }
 
-// Serialize serializes the provided list of NDP options into o.
+// Serialize serializes the provided list of NDP options into b.
 //
 // Note, b must be of sufficient size to hold all the options in s. See
 // NDPOptionsSerializer.Length for details on the getting the total size
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 39ca774ef..973f06cbc 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -9,7 +9,6 @@ go_library(
     deps = [
         "//pkg/sync",
         "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index c95aef63c..0efbfb22b 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -22,7 +22,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -32,7 +31,7 @@ type PacketInfo struct {
 	Pkt   *stack.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
-	Route stack.Route
+	Route *stack.Route
 }
 
 // Notification is the interface for receiving notification from the packet
@@ -271,21 +270,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 	return n, nil
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	p := PacketInfo{
-		Pkt: stack.NewPacketBuffer(stack.PacketBufferOptions{
-			Data: vv,
-		}),
-		Proto: 0,
-		GSO:   nil,
-	}
-
-	e.q.Write(p)
-
-	return nil
-}
-
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
 
diff --git a/pkg/tcpip/link/ethernet/ethernet.go b/pkg/tcpip/link/ethernet/ethernet.go
index 3eef7cd56..beefcd008 100644
--- a/pkg/tcpip/link/ethernet/ethernet.go
+++ b/pkg/tcpip/link/ethernet/ethernet.go
@@ -62,7 +62,7 @@ func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
 
 // WritePacket implements stack.LinkEndpoint.
 func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
-	e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress, proto, pkt)
+	e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress(), proto, pkt)
 	return e.Endpoint.WritePacket(r, gso, proto, pkt)
 }
 
@@ -71,7 +71,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 	linkAddr := e.Endpoint.LinkAddress()
 
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.AddHeader(linkAddr, r.RemoteLinkAddress, proto, pkt)
+		e.AddHeader(linkAddr, r.RemoteLinkAddress(), proto, pkt)
 	}
 
 	return e.Endpoint.WritePackets(r, gso, pkts, proto)
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 975309fc8..9f2084eae 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -410,7 +410,7 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net
 // currently writable, the packet is dropped.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if e.hdrSize > 0 {
-		e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt)
+		e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress(), protocol, pkt)
 	}
 
 	var builder iovec.Builder
@@ -453,7 +453,7 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc
 	mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch))
 	for _, pkt := range batch {
 		if e.hdrSize > 0 {
-			e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt)
+			e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress(), pkt.NetworkProtocolNumber, pkt)
 		}
 
 		var vnetHdrBuf []byte
@@ -558,11 +558,6 @@ func viewsEqual(vs1, vs2 []buffer.View) bool {
 	return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0])
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	return rawfile.NonBlockingWrite(e.fds[0], vv.ToView())
-}
-
 // InjectOutobund implements stack.InjectableEndpoint.InjectOutbound.
 func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
 	return rawfile.NonBlockingWrite(e.fds[0], packet)
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 709f829c8..ce4da7230 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -183,9 +183,8 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u
 	c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize})
 	defer c.cleanup()
 
-	r := &stack.Route{
-		RemoteLinkAddress: raddr,
-	}
+	var r stack.Route
+	r.ResolveWith(raddr)
 
 	// Build payload.
 	payload := buffer.NewView(plen)
@@ -220,7 +219,7 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u
 			L3HdrLen:   header.IPv4MaximumHeaderSize,
 		}
 	}
-	if err := c.ep.WritePacket(r, gso, proto, pkt); err != nil {
+	if err := c.ep.WritePacket(&r, gso, proto, pkt); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
@@ -325,9 +324,9 @@ func TestPreserveSrcAddress(t *testing.T) {
 
 	// Set LocalLinkAddress in route to the value of the bridged address.
 	r := &stack.Route{
-		RemoteLinkAddress: raddr,
-		LocalLinkAddress:  baddr,
+		LocalLinkAddress: baddr,
 	}
+	r.ResolveWith(raddr)
 
 	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 		// WritePacket panics given a prependable with anything less than
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 38aa694e4..edca57e4e 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -96,23 +96,6 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList
 	panic("not implemented")
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: vv,
-	})
-	// There should be an ethernet header at the beginning of vv.
-	hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
-	if !ok {
-		// Reject the packet if it's shorter than an ethernet header.
-		return tcpip.ErrBadAddress
-	}
-	linkHeader := header.Ethernet(hdr)
-	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), pkt)
-
-	return nil
-}
-
 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
 func (*endpoint) ARPHardwareType() header.ARPHardwareType {
 	return header.ARPHardwareLoopback
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
index e7493e5c5..cbda59775 100644
--- a/pkg/tcpip/link/muxed/BUILD
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -8,7 +8,6 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 56a611825..22e79ce3a 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -17,7 +17,6 @@ package muxed
 
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -106,13 +105,6 @@ func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protoco
 	return tcpip.ErrNoRoute
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (m *InjectableEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
-	// WriteRawPacket doesn't get a route or network address, so there's
-	// nowhere to write this.
-	return tcpip.ErrNoRoute
-}
-
 // InjectOutbound writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the dest address.
 func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
diff --git a/pkg/tcpip/link/nested/BUILD b/pkg/tcpip/link/nested/BUILD
index 2cdb23475..00b42b924 100644
--- a/pkg/tcpip/link/nested/BUILD
+++ b/pkg/tcpip/link/nested/BUILD
@@ -11,7 +11,6 @@ go_library(
     deps = [
         "//pkg/sync",
         "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go
index d40de54df..0ee54c3d5 100644
--- a/pkg/tcpip/link/nested/nested.go
+++ b/pkg/tcpip/link/nested/nested.go
@@ -19,7 +19,6 @@ package nested
 import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -123,11 +122,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 	return e.child.WritePackets(r, gso, pkts, protocol)
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.
-func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	return e.child.WriteRawPacket(vv)
-}
-
 // Wait implements stack.LinkEndpoint.
 func (e *Endpoint) Wait() {
 	e.child.Wait()
diff --git a/pkg/tcpip/link/packetsocket/endpoint.go b/pkg/tcpip/link/packetsocket/endpoint.go
index 3922c2a04..9a1b0c0c2 100644
--- a/pkg/tcpip/link/packetsocket/endpoint.go
+++ b/pkg/tcpip/link/packetsocket/endpoint.go
@@ -36,14 +36,14 @@ func New(lower stack.LinkEndpoint) stack.LinkEndpoint {
 
 // WritePacket implements stack.LinkEndpoint.WritePacket.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
-	e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress, r.LocalLinkAddress, protocol, pkt)
+	e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress(), r.LocalLinkAddress, protocol, pkt)
 	return e.Endpoint.WritePacket(r, gso, protocol, pkt)
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.Endpoint.DeliverOutboundPacket(pkt.EgressRoute.RemoteLinkAddress, pkt.EgressRoute.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt)
+		e.Endpoint.DeliverOutboundPacket(pkt.EgressRoute.RemoteLinkAddress(), pkt.EgressRoute.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt)
 	}
 
 	return e.Endpoint.WritePackets(r, gso, pkts, proto)
diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go
index 523b0d24b..25c364391 100644
--- a/pkg/tcpip/link/pipe/pipe.go
+++ b/pkg/tcpip/link/pipe/pipe.go
@@ -55,7 +55,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, _ *stack.GSO, proto tcpip.Network
 	// remote address from the perspective of the other end of the pipe
 	// (e.linked). Similarly, the remote address from the perspective of this
 	// endpoint is the local address on the other end.
-	e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{
+	e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress() /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{
 		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
 	}))
 
@@ -67,11 +67,6 @@ func (*Endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList,
 	panic("not implemented")
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.
-func (*Endpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
-	panic("not implemented")
-}
-
 // Attach implements stack.LinkEndpoint.
 func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
 	e.dispatcher = dispatcher
diff --git a/pkg/tcpip/link/qdisc/fifo/BUILD b/pkg/tcpip/link/qdisc/fifo/BUILD
index 1d0079bd6..5bea598eb 100644
--- a/pkg/tcpip/link/qdisc/fifo/BUILD
+++ b/pkg/tcpip/link/qdisc/fifo/BUILD
@@ -13,7 +13,6 @@ go_library(
         "//pkg/sleep",
         "//pkg/sync",
         "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go
index fc1e34fc7..27667f5f0 100644
--- a/pkg/tcpip/link/qdisc/fifo/endpoint.go
+++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -156,7 +155,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 	// WritePacket caller's do not set the following fields in PacketBuffer
 	// so we populate them here.
 	newRoute := r.Clone()
-	pkt.EgressRoute = &newRoute
+	pkt.EgressRoute = newRoute
 	pkt.GSOOptions = gso
 	pkt.NetworkProtocolNumber = protocol
 	d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
@@ -183,7 +182,7 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketB
 		// the route here to ensure it doesn't get released while the
 		// packet is still in our queue.
 		newRoute := pkt.EgressRoute.Clone()
-		pkt.EgressRoute = &newRoute
+		pkt.EgressRoute = newRoute
 		if !d.q.enqueue(pkt) {
 			if enqueued > 0 {
 				d.newPacketWaker.Assert()
@@ -197,13 +196,6 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketB
 	return enqueued, nil
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	// TODO(gvisor.dev/issue/3267): Queue these packets as well once
-	// WriteRawPacket takes PacketBuffer instead of VectorisedView.
-	return e.lower.WriteRawPacket(vv)
-}
-
 // Wait implements stack.LinkEndpoint.Wait.
 func (e *endpoint) Wait() {
 	e.lower.Wait()
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 7fb8a6c49..5660418fa 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -204,7 +204,7 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
 func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
-	e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt)
+	e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress(), protocol, pkt)
 
 	views := pkt.Views()
 	// Transmit the packet.
@@ -224,21 +224,6 @@ func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketB
 	panic("not implemented")
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	views := vv.Views()
-	// Transmit the packet.
-	e.mu.Lock()
-	ok := e.tx.transmit(views...)
-	e.mu.Unlock()
-
-	if !ok {
-		return tcpip.ErrWouldBlock
-	}
-
-	return nil
-}
-
 // dispatchLoop reads packets from the rx queue in a loop and dispatches them
 // to the network stack.
 func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 22d5c97f1..7131392cc 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -260,9 +260,8 @@ func TestSimpleSend(t *testing.T) {
 	defer c.cleanup()
 
 	// Prepare route.
-	r := stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
-	}
+	var r stack.Route
+	r.ResolveWith(remoteLinkAddr)
 
 	for iters := 1000; iters > 0; iters-- {
 		func() {
@@ -342,9 +341,9 @@ func TestPreserveSrcAddressInSend(t *testing.T) {
 	newLocalLinkAddress := tcpip.LinkAddress(strings.Repeat("0xFE", 6))
 	// Set both remote and local link address in route.
 	r := stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
-		LocalLinkAddress:  newLocalLinkAddress,
+		LocalLinkAddress: newLocalLinkAddress,
 	}
+	r.ResolveWith(remoteLinkAddr)
 
 	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 		// WritePacket panics given a prependable with anything less than
@@ -395,9 +394,8 @@ func TestFillTxQueue(t *testing.T) {
 	defer c.cleanup()
 
 	// Prepare to send a packet.
-	r := stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
-	}
+	var r stack.Route
+	r.ResolveWith(remoteLinkAddr)
 
 	buf := buffer.NewView(100)
 
@@ -444,9 +442,8 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	c.txq.rx.Flush()
 
 	// Prepare to send a packet.
-	r := stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
-	}
+	var r stack.Route
+	r.ResolveWith(remoteLinkAddr)
 
 	buf := buffer.NewView(100)
 
@@ -509,9 +506,8 @@ func TestFillTxMemory(t *testing.T) {
 	defer c.cleanup()
 
 	// Prepare to send a packet.
-	r := stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
-	}
+	var r stack.Route
+	r.ResolveWith(remoteLinkAddr)
 
 	buf := buffer.NewView(100)
 
@@ -557,9 +553,8 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	defer c.cleanup()
 
 	// Prepare to send a packet.
-	r := stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
-	}
+	var r stack.Route
+	r.ResolveWith(remoteLinkAddr)
 
 	buf := buffer.NewView(100)
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index b3e8c4b92..8d9a91020 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -53,16 +53,35 @@ type endpoint struct {
 	nested.Endpoint
 	writer     io.Writer
 	maxPCAPLen uint32
+	logPrefix  string
 }
 
 var _ stack.GSOEndpoint = (*endpoint)(nil)
 var _ stack.LinkEndpoint = (*endpoint)(nil)
 var _ stack.NetworkDispatcher = (*endpoint)(nil)
 
+type direction int
+
+const (
+	directionSend = iota
+	directionRecv
+)
+
 // New creates a new sniffer link-layer endpoint. It wraps around another
 // endpoint and logs packets and they traverse the endpoint.
 func New(lower stack.LinkEndpoint) stack.LinkEndpoint {
-	sniffer := &endpoint{}
+	return NewWithPrefix(lower, "")
+}
+
+// NewWithPrefix creates a new sniffer link-layer endpoint. It wraps around
+// another endpoint and logs packets prefixed with logPrefix as they traverse
+// the endpoint.
+//
+// logPrefix is prepended to the log line without any separators.
+// E.g. logPrefix = "NIC:en0/" will produce log lines like
+// "NIC:en0/send udp [...]".
+func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoint {
+	sniffer := &endpoint{logPrefix: logPrefix}
 	sniffer.Endpoint.Init(lower, sniffer)
 	return sniffer
 }
@@ -120,7 +139,7 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
 func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
-	e.dumpPacket("recv", nil, protocol, pkt)
+	e.dumpPacket(directionRecv, nil, protocol, pkt)
 	e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt)
 }
 
@@ -129,10 +148,10 @@ func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protoc
 	e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt)
 }
 
-func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+func (e *endpoint) dumpPacket(dir direction, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	writer := e.writer
 	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
-		logPacket(prefix, protocol, pkt, gso)
+		logPacket(e.logPrefix, dir, protocol, pkt, gso)
 	}
 	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
 		totalLength := pkt.Size()
@@ -169,7 +188,7 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
-	e.dumpPacket("send", gso, protocol, pkt)
+	e.dumpPacket(directionSend, gso, protocol, pkt)
 	return e.Endpoint.WritePacket(r, gso, protocol, pkt)
 }
 
@@ -178,20 +197,12 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // forwards the request to the lower endpoint.
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.dumpPacket("send", gso, protocol, pkt)
+		e.dumpPacket(directionSend, gso, protocol, pkt)
 	}
 	return e.Endpoint.WritePackets(r, gso, pkts, protocol)
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	e.dumpPacket("send", nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: vv,
-	}))
-	return e.Endpoint.WriteRawPacket(vv)
-}
-
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
+func logPacket(prefix string, dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -201,6 +212,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 	var fragmentOffset uint16
 	var moreFragments bool
 
+	var directionPrefix string
+	switch dir {
+	case directionSend:
+		directionPrefix = "send"
+	case directionRecv:
+		directionPrefix = "recv"
+	default:
+		panic(fmt.Sprintf("unrecognized direction: %d", dir))
+	}
+
 	// Clone the packet buffer to not modify the original.
 	//
 	// We don't clone the original packet buffer so that the new packet buffer
@@ -248,15 +269,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 		arp := header.ARP(pkt.NetworkHeader().View())
 		log.Infof(
-			"%s arp %s (%s) -> %s (%s) valid:%t",
+			"%s%s arp %s (%s) -> %s (%s) valid:%t",
 			prefix,
+			directionPrefix,
 			tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
 			tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
 			arp.IsValid(),
 		)
 		return
 	default:
-		log.Infof("%s unknown network protocol", prefix)
+		log.Infof("%s%s unknown network protocol", prefix, directionPrefix)
 		return
 	}
 
@@ -300,7 +322,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 				icmpType = "info reply"
 			}
 		}
-		log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code())
 		return
 
 	case header.ICMPv6ProtocolNumber:
@@ -335,7 +357,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		case header.ICMPv6RedirectMsg:
 			icmpType = "redirect message"
 		}
-		log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code())
 		return
 
 	case header.UDPProtocolNumber:
@@ -391,7 +413,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		}
 
 	default:
-		log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto)
+		log.Infof("%s%s %s -> %s unknown transport protocol: %d", prefix, directionPrefix, src, dst, transProto)
 		return
 	}
 
@@ -399,5 +421,5 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		details += fmt.Sprintf(" gso: %+v", gso)
 	}
 
-	log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
+	log.Infof("%s%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, directionPrefix, transName, src, srcPort, dst, dstPort, size, id, details)
 }
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index 9a76bdba7..a364c5801 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -264,7 +264,7 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
 	// If the packet does not already have link layer header, and the route
 	// does not exist, we can't compute it. This is possibly a raw packet, tun
 	// device doesn't support this at the moment.
-	if info.Pkt.LinkHeader().View().IsEmpty() && info.Route.RemoteLinkAddress == "" {
+	if info.Pkt.LinkHeader().View().IsEmpty() && info.Route.RemoteLinkAddress() == "" {
 		return nil, false
 	}
 
@@ -272,7 +272,7 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
 	if d.hasFlags(linux.IFF_TAP) {
 		// Add ethernet header if not provided.
 		if info.Pkt.LinkHeader().View().IsEmpty() {
-			d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress, info.Proto, info.Pkt)
+			d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress(), info.Proto, info.Pkt)
 		}
 		vv.AppendView(info.Pkt.LinkHeader().View())
 	}
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
index ee84c3d96..9b4602c1b 100644
--- a/pkg/tcpip/link/waitable/BUILD
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -11,7 +11,6 @@ go_library(
     deps = [
         "//pkg/gate",
         "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
@@ -25,7 +24,6 @@ go_test(
     library = ":waitable",
     deps = [
         "//pkg/tcpip",
-        "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
     ],
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index b152a0f26..cf0077f43 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -24,7 +24,6 @@ package waitable
 import (
 	"gvisor.dev/gvisor/pkg/gate"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -132,17 +131,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 	return n, err
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	if !e.writeGate.Enter() {
-		return nil
-	}
-
-	err := e.lower.WriteRawPacket(vv)
-	e.writeGate.Leave()
-	return err
-}
-
 // WaitWrite prevents new calls to WritePacket from reaching the lower endpoint,
 // and waits for inflight ones to finish before returning.
 func (e *Endpoint) WaitWrite() {
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 94827fc56..cf7fb5126 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -18,7 +18,6 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -81,11 +80,6 @@ func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.
 	return pkts.Len(), nil
 }
 
-func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
-	e.writeCount++
-	return nil
-}
-
 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
 func (*countedEndpoint) ARPHardwareType() header.ARPHardwareType {
 	panic("unimplemented")
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index b38aff0b8..9ebf31b78 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -7,12 +7,14 @@ go_test(
     size = "small",
     srcs = [
         "ip_test.go",
+        "multicast_group_test.go",
     ],
     deps = [
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
+        "//pkg/tcpip/faketime",
         "//pkg/tcpip/header",
         "//pkg/tcpip/header/parse",
         "//pkg/tcpip/link/channel",
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index f462524c9..0fb373612 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -319,9 +319,9 @@ func TestDirectRequestWithNeighborCache(t *testing.T) {
 			copy(h.HardwareAddressSender(), test.senderLinkAddr)
 			copy(h.ProtocolAddressSender(), test.senderAddr)
 			copy(h.ProtocolAddressTarget(), test.targetAddr)
-			c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{
+			c.linkEP.InjectInbound(arp.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 				Data: v.ToVectorisedView(),
-			})
+			}))
 
 			if !test.isValid {
 				// No packets should be sent after receiving an invalid ARP request.
@@ -442,9 +442,9 @@ func (*testInterface) Promiscuous() bool {
 
 func (t *testInterface) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	r := stack.Route{
-		NetProto:          protocol,
-		RemoteLinkAddress: remoteLinkAddr,
+		NetProto: protocol,
 	}
+	r.ResolveWith(remoteLinkAddr)
 	return t.LinkEndpoint.WritePacket(&r, gso, protocol, pkt)
 }
 
@@ -557,8 +557,8 @@ func TestLinkAddressRequest(t *testing.T) {
 				t.Fatal("expected to send a link address request")
 			}
 
-			if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr {
-				t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr)
+			if got := pkt.Route.RemoteLinkAddress(); got != test.expectedRemoteLinkAddr {
+				t.Errorf("got pkt.Route.RemoteLinkAddress() = %s, want = %s", got, test.expectedRemoteLinkAddr)
 			}
 
 			rep := header.ARP(stack.PayloadSince(pkt.Pkt.NetworkHeader()))
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index c75ca7d71..d31296a41 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -46,9 +46,13 @@ const (
 )
 
 var (
-	// ErrInvalidArgs indicates to the caller that that an invalid argument was
+	// ErrInvalidArgs indicates to the caller that an invalid argument was
 	// provided.
 	ErrInvalidArgs = errors.New("invalid args")
+
+	// ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps
+	// with another one.
+	ErrFragmentOverlap = errors.New("overlapping fragments")
 )
 
 // FragmentID is the identifier for a fragment.
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 19f4920b3..04072d966 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -26,9 +26,9 @@ import (
 )
 
 type hole struct {
-	first   uint16
-	last    uint16
-	deleted bool
+	first  uint16
+	last   uint16
+	filled bool
 }
 
 type reassembler struct {
@@ -38,7 +38,7 @@ type reassembler struct {
 	proto        uint8
 	mu           sync.Mutex
 	holes        []hole
-	deleted      int
+	filled       int
 	heap         fragHeap
 	done         bool
 	creationTime int64
@@ -53,44 +53,86 @@ func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler {
 		creationTime: clock.NowMonotonic(),
 	}
 	r.holes = append(r.holes, hole{
-		first:   0,
-		last:    math.MaxUint16,
-		deleted: false})
+		first:  0,
+		last:   math.MaxUint16,
+		filled: false,
+	})
 	return r
 }
 
-// updateHoles updates the list of holes for an incoming fragment and
-// returns true iff the fragment filled at least part of an existing hole.
-func (r *reassembler) updateHoles(first, last uint16, more bool) bool {
-	used := false
+// updateHoles updates the list of holes for an incoming fragment. It returns
+// true if the fragment fits, it is not a duplicate and it does not overlap with
+// another fragment.
+//
+// For IPv6, overlaps with an existing fragment are explicitly forbidden by
+// RFC 8200 section 4.5:
+//   If any of the fragments being reassembled overlap with any other fragments
+//   being reassembled for the same packet, reassembly of that packet must be
+//   abandoned and all the fragments that have been received for that packet
+//   must be discarded, and no ICMP error messages should be sent.
+//
+// It is not explicitly forbidden for IPv4, but to keep parity with Linux we
+// disallow it as well:
+// https://github.com/torvalds/linux/blob/38525c6/net/ipv4/inet_fragment.c#L349
+func (r *reassembler) updateHoles(first, last uint16, more bool) (bool, error) {
 	for i := range r.holes {
-		if r.holes[i].deleted || first > r.holes[i].last || last < r.holes[i].first {
+		currentHole := &r.holes[i]
+
+		if currentHole.filled || last < currentHole.first || currentHole.last < first {
 			continue
 		}
-		used = true
-		r.deleted++
-		r.holes[i].deleted = true
-		if first > r.holes[i].first {
-			r.holes = append(r.holes, hole{r.holes[i].first, first - 1, false})
+
+		if first < currentHole.first || currentHole.last < last {
+			// Incoming fragment only partially fits in the free hole.
+			return false, ErrFragmentOverlap
+		}
+
+		r.filled++
+		if first > currentHole.first {
+			r.holes = append(r.holes, hole{
+				first:  currentHole.first,
+				last:   first - 1,
+				filled: false,
+			})
+		}
+		if last < currentHole.last && more {
+			r.holes = append(r.holes, hole{
+				first:  last + 1,
+				last:   currentHole.last,
+				filled: false,
+			})
 		}
-		if last < r.holes[i].last && more {
-			r.holes = append(r.holes, hole{last + 1, r.holes[i].last, false})
+		// Update the current hole to precisely match the incoming fragment.
+		r.holes[i] = hole{
+			first:  first,
+			last:   last,
+			filled: true,
 		}
+		return true, nil
 	}
-	return used
+
+	// Incoming fragment is a duplicate/subset, or its offset comes after the end
+	// of the reassembled payload.
+	return false, nil
 }
 
 func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (buffer.VectorisedView, uint8, bool, int, error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
-	consumed := 0
 	if r.done {
 		// A concurrent goroutine might have already reassembled
 		// the packet and emptied the heap while this goroutine
 		// was waiting on the mutex. We don't have to do anything in this case.
-		return buffer.VectorisedView{}, 0, false, consumed, nil
+		return buffer.VectorisedView{}, 0, false, 0, nil
 	}
-	if r.updateHoles(first, last, more) {
+
+	used, err := r.updateHoles(first, last, more)
+	if err != nil {
+		return buffer.VectorisedView{}, 0, false, 0, fmt.Errorf("fragment reassembly failed: %w", err)
+	}
+
+	var consumed int
+	if used {
 		// For IPv6, it is possible to have different Protocol values between
 		// fragments of a packet (because, unlike IPv4, the Protocol is not used to
 		// identify a fragment). In this case, only the Protocol of the first
@@ -109,13 +151,14 @@ func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *s
 		consumed = vv.Size()
 		r.size += consumed
 	}
-	// Check if all the holes have been deleted and we are ready to reassamble.
-	if r.deleted < len(r.holes) {
+
+	// Check if all the holes have been filled and we are ready to reassemble.
+	if r.filled < len(r.holes) {
 		return buffer.VectorisedView{}, 0, false, consumed, nil
 	}
 	res, err := r.heap.reassemble()
 	if err != nil {
-		return buffer.VectorisedView{}, 0, false, consumed, fmt.Errorf("fragment reassembly failed: %w", err)
+		return buffer.VectorisedView{}, 0, false, 0, fmt.Errorf("fragment reassembly failed: %w", err)
 	}
 	return res, r.proto, true, consumed, nil
 }
diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go
index a0a04a027..cee3063b1 100644
--- a/pkg/tcpip/network/fragmentation/reassembler_test.go
+++ b/pkg/tcpip/network/fragmentation/reassembler_test.go
@@ -16,92 +16,124 @@ package fragmentation
 
 import (
 	"math"
-	"reflect"
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip/faketime"
 )
 
-type updateHolesInput struct {
-	first uint16
-	last  uint16
-	more  bool
+type updateHolesParams struct {
+	first     uint16
+	last      uint16
+	more      bool
+	wantUsed  bool
+	wantError error
 }
 
-var holesTestCases = []struct {
-	comment string
-	in      []updateHolesInput
-	want    []hole
-}{
-	{
-		comment: "No fragments. Expected holes: {[0 -> inf]}.",
-		in:      []updateHolesInput{},
-		want:    []hole{{first: 0, last: math.MaxUint16, deleted: false}},
-	},
-	{
-		comment: "One fragment at beginning. Expected holes: {[2, inf]}.",
-		in:      []updateHolesInput{{first: 0, last: 1, more: true}},
-		want: []hole{
-			{first: 0, last: math.MaxUint16, deleted: true},
-			{first: 2, last: math.MaxUint16, deleted: false},
+func TestUpdateHoles(t *testing.T) {
+	var tests = []struct {
+		name   string
+		params []updateHolesParams
+		want   []hole
+	}{
+		{
+			name:   "No fragments",
+			params: nil,
+			want:   []hole{{first: 0, last: math.MaxUint16, filled: false}},
 		},
-	},
-	{
-		comment: "One fragment in the middle. Expected holes: {[0, 0], [3, inf]}.",
-		in:      []updateHolesInput{{first: 1, last: 2, more: true}},
-		want: []hole{
-			{first: 0, last: math.MaxUint16, deleted: true},
-			{first: 0, last: 0, deleted: false},
-			{first: 3, last: math.MaxUint16, deleted: false},
+		{
+			name:   "One fragment at beginning",
+			params: []updateHolesParams{{first: 0, last: 1, more: true, wantUsed: true, wantError: nil}},
+			want: []hole{
+				{first: 0, last: 1, filled: true},
+				{first: 2, last: math.MaxUint16, filled: false},
+			},
 		},
-	},
-	{
-		comment: "One fragment at the end. Expected holes: {[0, 0]}.",
-		in:      []updateHolesInput{{first: 1, last: 2, more: false}},
-		want: []hole{
-			{first: 0, last: math.MaxUint16, deleted: true},
-			{first: 0, last: 0, deleted: false},
+		{
+			name:   "One fragment in the middle",
+			params: []updateHolesParams{{first: 1, last: 2, more: true, wantUsed: true, wantError: nil}},
+			want: []hole{
+				{first: 1, last: 2, filled: true},
+				{first: 0, last: 0, filled: false},
+				{first: 3, last: math.MaxUint16, filled: false},
+			},
 		},
-	},
-	{
-		comment: "One fragment completing a packet. Expected holes: {}.",
-		in:      []updateHolesInput{{first: 0, last: 1, more: false}},
-		want: []hole{
-			{first: 0, last: math.MaxUint16, deleted: true},
+		{
+			name:   "One fragment at the end",
+			params: []updateHolesParams{{first: 1, last: 2, more: false, wantUsed: true, wantError: nil}},
+			want: []hole{
+				{first: 1, last: 2, filled: true},
+				{first: 0, last: 0, filled: false},
+			},
 		},
-	},
-	{
-		comment: "Two non-overlapping fragments completing a packet. Expected holes: {}.",
-		in: []updateHolesInput{
-			{first: 0, last: 1, more: true},
-			{first: 2, last: 3, more: false},
+		{
+			name:   "One fragment completing a packet",
+			params: []updateHolesParams{{first: 0, last: 1, more: false, wantUsed: true, wantError: nil}},
+			want: []hole{
+				{first: 0, last: 1, filled: true},
+			},
 		},
-		want: []hole{
-			{first: 0, last: math.MaxUint16, deleted: true},
-			{first: 2, last: math.MaxUint16, deleted: true},
+		{
+			name: "Two fragments completing a packet",
+			params: []updateHolesParams{
+				{first: 0, last: 1, more: true, wantUsed: true, wantError: nil},
+				{first: 2, last: 3, more: false, wantUsed: true, wantError: nil},
+			},
+			want: []hole{
+				{first: 0, last: 1, filled: true},
+				{first: 2, last: 3, filled: true},
+			},
 		},
-	},
-	{
-		comment: "Two overlapping fragments completing a packet. Expected holes: {}.",
-		in: []updateHolesInput{
-			{first: 0, last: 2, more: true},
-			{first: 2, last: 3, more: false},
+		{
+			name: "Two fragments completing a packet with a duplicate",
+			params: []updateHolesParams{
+				{first: 0, last: 1, more: true, wantUsed: true, wantError: nil},
+				{first: 0, last: 1, more: true, wantUsed: false, wantError: nil},
+				{first: 2, last: 3, more: false, wantUsed: true, wantError: nil},
+			},
+			want: []hole{
+				{first: 0, last: 1, filled: true},
+				{first: 2, last: 3, filled: true},
+			},
 		},
-		want: []hole{
-			{first: 0, last: math.MaxUint16, deleted: true},
-			{first: 3, last: math.MaxUint16, deleted: true},
+		{
+			name: "Two overlapping fragments",
+			params: []updateHolesParams{
+				{first: 0, last: 10, more: true, wantUsed: true, wantError: nil},
+				{first: 5, last: 15, more: false, wantUsed: false, wantError: ErrFragmentOverlap},
+				{first: 11, last: 15, more: false, wantUsed: true, wantError: nil},
+			},
+			want: []hole{
+				{first: 0, last: 10, filled: true},
+				{first: 11, last: 15, filled: true},
+			},
 		},
-	},
-}
+		{
+			name: "Out of bounds fragment",
+			params: []updateHolesParams{
+				{first: 0, last: 10, more: true, wantUsed: true, wantError: nil},
+				{first: 11, last: 15, more: false, wantUsed: true, wantError: nil},
+				{first: 16, last: 20, more: false, wantUsed: false, wantError: nil},
+			},
+			want: []hole{
+				{first: 0, last: 10, filled: true},
+				{first: 11, last: 15, filled: true},
+			},
+		},
+	}
 
-func TestUpdateHoles(t *testing.T) {
-	for _, c := range holesTestCases {
-		r := newReassembler(FragmentID{}, &faketime.NullClock{})
-		for _, i := range c.in {
-			r.updateHoles(i.first, i.last, i.more)
-		}
-		if !reflect.DeepEqual(r.holes, c.want) {
-			t.Errorf("Test \"%s\" produced unexepetced holes. Got %v. Want %v", c.comment, r.holes, c.want)
-		}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			r := newReassembler(FragmentID{}, &faketime.NullClock{})
+			for _, param := range test.params {
+				used, err := r.updateHoles(param.first, param.last, param.more)
+				if used != param.wantUsed || err != param.wantError {
+					t.Errorf("got r.updateHoles(%d, %d, %t) = (%t, %v), want = (%t, %v)", param.first, param.last, param.more, used, err, param.wantUsed, param.wantError)
+				}
+			}
+			if diff := cmp.Diff(test.want, r.holes, cmp.AllowUnexported(hole{})); diff != "" {
+				t.Errorf("r.holes mismatch (-want +got):\n%s", diff)
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/network/ip/BUILD b/pkg/tcpip/network/ip/BUILD
new file mode 100644
index 000000000..6ca200b48
--- /dev/null
+++ b/pkg/tcpip/network/ip/BUILD
@@ -0,0 +1,25 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "ip",
+    srcs = ["generic_multicast_protocol.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip",
+    ],
+)
+
+go_test(
+    name = "ip_test",
+    size = "small",
+    srcs = ["generic_multicast_protocol_test.go"],
+    deps = [
+        ":ip",
+        "//pkg/tcpip",
+        "//pkg/tcpip/faketime",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/network/ip/generic_multicast_protocol.go b/pkg/tcpip/network/ip/generic_multicast_protocol.go
new file mode 100644
index 000000000..f14e2a88a
--- /dev/null
+++ b/pkg/tcpip/network/ip/generic_multicast_protocol.go
@@ -0,0 +1,546 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ip holds IPv4/IPv6 common utilities.
+package ip
+
+import (
+	"fmt"
+	"math/rand"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// hostState is the state a host may be in for a multicast group.
+type hostState int
+
+// The states below are generic across IGMPv2 (RFC 2236 section 6) and MLDv1
+// (RFC 2710 section 5). Even though the states are generic across both IGMPv2
+// and MLDv1, IGMPv2 terminology will be used.
+const (
+	// nonMember is the "'Non-Member' state, when the host does not belong to the
+	// group on the interface. This is the initial state for all memberships on
+	// all network interfaces; it requires no storage in the host."
+	//
+	// 'Non-Listener' is the MLDv1 term used to describe this state.
+	//
+	// This state is used to keep track of groups that have been joined locally,
+	// but without advertising the membership to the network.
+	nonMember hostState = iota
+
+	// delayingMember is the "'Delaying Member' state, when the host belongs to
+	// the group on the interface and has a report delay timer running for that
+	// membership."
+	//
+	// 'Delaying Listener' is the MLDv1 term used to describe this state.
+	delayingMember
+
+	// idleMember is the "Idle Member" state, when the host belongs to the group
+	// on the interface and does not have a report delay timer running for that
+	// membership.
+	//
+	// 'Idle Listener' is the MLDv1 term used to describe this state.
+	idleMember
+)
+
+// multicastGroupState holds the Generic Multicast Protocol state for a
+// multicast group.
+type multicastGroupState struct {
+	// joins is the number of times the group has been joined.
+	joins uint64
+
+	// state holds the host's state for the group.
+	state hostState
+
+	// lastToSendReport is true if we sent the last report for the group. It is
+	// used to track whether there are other hosts on the subnet that are also
+	// members of the group.
+	//
+	// Defined in RFC 2236 section 6 page 9 for IGMPv2 and RFC 2710 section 5 page
+	// 8 for MLDv1.
+	lastToSendReport bool
+
+	// delayedReportJob is used to delay sending responses to membership report
+	// messages in order to reduce duplicate reports from multiple hosts on the
+	// interface.
+	//
+	// Must not be nil.
+	delayedReportJob *tcpip.Job
+}
+
+// GenericMulticastProtocolOptions holds options for the generic multicast
+// protocol.
+type GenericMulticastProtocolOptions struct {
+	// Enabled indicates whether the generic multicast protocol will be
+	// performed.
+	//
+	// When enabled, the protocol may transmit report and leave messages when
+	// joining and leaving multicast groups respectively, and handle incoming
+	// packets.
+	//
+	// When disabled, the protocol will still keep track of locally joined groups,
+	// it just won't transmit and handle packets, or update groups' state.
+	Enabled bool
+
+	// Rand is the source of random numbers.
+	Rand *rand.Rand
+
+	// Clock is the clock used to create timers.
+	Clock tcpip.Clock
+
+	// Protocol is the implementation of the variant of multicast group protocol
+	// in use.
+	Protocol MulticastGroupProtocol
+
+	// MaxUnsolicitedReportDelay is the maximum amount of time to wait between
+	// transmitting unsolicited reports.
+	//
+	// Unsolicited reports are transmitted when a group is newly joined.
+	MaxUnsolicitedReportDelay time.Duration
+
+	// AllNodesAddress is a multicast address that all nodes on a network should
+	// be a member of.
+	//
+	// This address will not have the generic multicast protocol performed on it;
+	// it will be left in the non member/listener state, and packets will never
+	// be sent for it.
+	AllNodesAddress tcpip.Address
+}
+
+// MulticastGroupProtocol is a multicast group protocol whose core state machine
+// can be represented by GenericMulticastProtocolState.
+type MulticastGroupProtocol interface {
+	// SendReport sends a multicast report for the specified group address.
+	SendReport(groupAddress tcpip.Address) *tcpip.Error
+
+	// SendLeave sends a multicast leave for the specified group address.
+	SendLeave(groupAddress tcpip.Address) *tcpip.Error
+}
+
+// GenericMulticastProtocolState is the per interface generic multicast protocol
+// state.
+//
+// There is actually no protocol named "Generic Multicast Protocol". Instead,
+// the term used to refer to a generic multicast protocol that applies to both
+// IPv4 and IPv6. Specifically, Generic Multicast Protocol is the core state
+// machine of IGMPv2 as defined by RFC 2236 and MLDv1 as defined by RFC 2710.
+//
+// GenericMulticastProtocolState.Init MUST be called before calling any of
+// the methods on GenericMulticastProtocolState.
+type GenericMulticastProtocolState struct {
+	opts GenericMulticastProtocolOptions
+
+	mu struct {
+		sync.RWMutex
+
+		// memberships holds group addresses and their associated state.
+		memberships map[tcpip.Address]multicastGroupState
+	}
+}
+
+// Init initializes the Generic Multicast Protocol state.
+//
+// maxUnsolicitedReportDelay is the maximum time between sending unsolicited
+// reports after joining a group.
+func (g *GenericMulticastProtocolState) Init(opts GenericMulticastProtocolOptions) {
+	g.mu.Lock()
+	defer g.mu.Unlock()
+	g.opts = opts
+	g.mu.memberships = make(map[tcpip.Address]multicastGroupState)
+}
+
+// MakeAllNonMember transitions all groups to the non-member state.
+//
+// The groups will still be considered joined locally.
+func (g *GenericMulticastProtocolState) MakeAllNonMember() {
+	if !g.opts.Enabled {
+		return
+	}
+
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	for groupAddress, info := range g.mu.memberships {
+		g.transitionToNonMemberLocked(groupAddress, &info)
+		g.mu.memberships[groupAddress] = info
+	}
+}
+
+// InitializeGroups initializes each group, as if they were newly joined but
+// without affecting the groups' join count.
+//
+// Must only be called after calling MakeAllNonMember as a group should not be
+// initialized while it is not in the non-member state.
+func (g *GenericMulticastProtocolState) InitializeGroups() {
+	if !g.opts.Enabled {
+		return
+	}
+
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	for groupAddress, info := range g.mu.memberships {
+		g.initializeNewMemberLocked(groupAddress, &info)
+		g.mu.memberships[groupAddress] = info
+	}
+}
+
+// JoinGroup handles joining a new group.
+//
+// If dontInitialize is true, the group will be not be initialized and will be
+// left in the non-member state - no packets will be sent for it until it is
+// initialized via InitializeGroups.
+func (g *GenericMulticastProtocolState) JoinGroup(groupAddress tcpip.Address, dontInitialize bool) {
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	if info, ok := g.mu.memberships[groupAddress]; ok {
+		// The group has already been joined.
+		info.joins++
+		g.mu.memberships[groupAddress] = info
+		return
+	}
+
+	info := multicastGroupState{
+		// Since we just joined the group, its count is 1.
+		joins: 1,
+		// The state will be updated below, if required.
+		state:            nonMember,
+		lastToSendReport: false,
+		delayedReportJob: tcpip.NewJob(g.opts.Clock, &g.mu, func() {
+			info, ok := g.mu.memberships[groupAddress]
+			if !ok {
+				panic(fmt.Sprintf("expected to find group state for group = %s", groupAddress))
+			}
+
+			info.lastToSendReport = g.opts.Protocol.SendReport(groupAddress) == nil
+			info.state = idleMember
+			g.mu.memberships[groupAddress] = info
+		}),
+	}
+
+	if !dontInitialize && g.opts.Enabled {
+		g.initializeNewMemberLocked(groupAddress, &info)
+	}
+
+	g.mu.memberships[groupAddress] = info
+}
+
+// IsLocallyJoined returns true if the group is locally joined.
+func (g *GenericMulticastProtocolState) IsLocallyJoined(groupAddress tcpip.Address) bool {
+	g.mu.RLock()
+	defer g.mu.RUnlock()
+	_, ok := g.mu.memberships[groupAddress]
+	return ok
+}
+
+// LeaveGroup handles leaving the group.
+//
+// Returns false if the group is not currently joined.
+func (g *GenericMulticastProtocolState) LeaveGroup(groupAddress tcpip.Address) bool {
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	info, ok := g.mu.memberships[groupAddress]
+	if !ok {
+		return false
+	}
+
+	if info.joins == 0 {
+		panic(fmt.Sprintf("tried to leave group %s with a join count of 0", groupAddress))
+	}
+	info.joins--
+	if info.joins != 0 {
+		// If we still have outstanding joins, then do nothing further.
+		g.mu.memberships[groupAddress] = info
+		return true
+	}
+
+	g.transitionToNonMemberLocked(groupAddress, &info)
+	delete(g.mu.memberships, groupAddress)
+	return true
+}
+
+// HandleQuery handles a query message with the specified maximum response time.
+//
+// If the group address is unspecified, then reports will be scheduled for all
+// joined groups.
+//
+// Report(s) will be scheduled to be sent after a random duration between 0 and
+// the maximum response time.
+func (g *GenericMulticastProtocolState) HandleQuery(groupAddress tcpip.Address, maxResponseTime time.Duration) {
+	if !g.opts.Enabled {
+		return
+	}
+
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	// As per RFC 2236 section 2.4 (for IGMPv2),
+	//
+	//   In a Membership Query message, the group address field is set to zero
+	//   when sending a General Query, and set to the group address being
+	//   queried when sending a Group-Specific Query.
+	//
+	// As per RFC 2710 section 3.6 (for MLDv1),
+	//
+	//   In a Query message, the Multicast Address field is set to zero when
+	//   sending a General Query, and set to a specific IPv6 multicast address
+	//   when sending a Multicast-Address-Specific Query.
+	if groupAddress.Unspecified() {
+		// This is a general query as the group address is unspecified.
+		for groupAddress, info := range g.mu.memberships {
+			g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime)
+			g.mu.memberships[groupAddress] = info
+		}
+	} else if info, ok := g.mu.memberships[groupAddress]; ok {
+		g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime)
+		g.mu.memberships[groupAddress] = info
+	}
+}
+
+// HandleReport handles a report message.
+//
+// If the report is for a joined group, any active delayed report will be
+// cancelled and the host state for the group transitions to idle.
+func (g *GenericMulticastProtocolState) HandleReport(groupAddress tcpip.Address) {
+	if !g.opts.Enabled {
+		return
+	}
+
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	// As per RFC 2236 section 3 pages 3-4 (for IGMPv2),
+	//
+	//   If the host receives another host's Report (version 1 or 2) while it has
+	//   a timer running, it stops its timer for the specified group and does not
+	//   send a Report
+	//
+	// As per RFC 2710 section 4 page 6 (for MLDv1),
+	//
+	//   If a node receives another node's Report from an interface for a
+	//   multicast address while it has a timer running for that same address
+	//   on that interface, it stops its timer and does not send a Report for
+	//   that address, thus suppressing duplicate reports on the link.
+	if info, ok := g.mu.memberships[groupAddress]; ok && info.state == delayingMember {
+		info.delayedReportJob.Cancel()
+		info.lastToSendReport = false
+		info.state = idleMember
+		g.mu.memberships[groupAddress] = info
+	}
+}
+
+// initializeNewMemberLocked initializes a new group membership.
+//
+// Precondition: g.mu must be locked.
+func (g *GenericMulticastProtocolState) initializeNewMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) {
+	if info.state != nonMember {
+		panic(fmt.Sprintf("state for group %s is not non-member; state = %d", groupAddress, info.state))
+	}
+
+	info.state = idleMember
+
+	if groupAddress == g.opts.AllNodesAddress {
+		// As per RFC 2236 section 6 page 10 (for IGMPv2),
+		//
+		//   The all-systems group (address 224.0.0.1) is handled as a special
+		//   case. The host starts in Idle Member state for that group on every
+		//   interface, never transitions to another state, and never sends a
+		//   report for that group.
+		//
+		// As per RFC 2710 section 5 page 10 (for MLDv1),
+		//
+		//   The link-scope all-nodes address (FF02::1) is handled as a special
+		//   case. The node starts in Idle Listener state for that address on
+		//   every interface, never transitions to another state, and never sends
+		//   a Report or Done for that address.
+		return
+	}
+
+	// As per RFC 2236 section 3 page 5 (for IGMPv2),
+	//
+	//   When a host joins a multicast group, it should immediately transmit an
+	//   unsolicited Version 2 Membership Report for that group" ... "it is
+	//   recommended that it be repeated".
+	//
+	// As per RFC 2710 section 4 page 6 (for MLDv1),
+	//
+	//   When a node starts listening to a multicast address on an interface,
+	//   it should immediately transmit an unsolicited Report for that address
+	//   on that interface, in case it is the first listener on the link. To
+	//   cover the possibility of the initial Report being lost or damaged, it
+	//   is recommended that it be repeated once or twice after short delays
+	//   [Unsolicited Report Interval].
+	//
+	// TODO(gvisor.dev/issue/4901): Support a configurable number of initial
+	// unsolicited reports.
+	info.lastToSendReport = g.opts.Protocol.SendReport(groupAddress) == nil
+	g.setDelayTimerForAddressRLocked(groupAddress, info, g.opts.MaxUnsolicitedReportDelay)
+}
+
+// maybeSendLeave attempts to send a leave message.
+func (g *GenericMulticastProtocolState) maybeSendLeave(groupAddress tcpip.Address, lastToSendReport bool) {
+	if !g.opts.Enabled || !lastToSendReport {
+		return
+	}
+
+	if groupAddress == g.opts.AllNodesAddress {
+		// As per RFC 2236 section 6 page 10 (for IGMPv2),
+		//
+		//   The all-systems group (address 224.0.0.1) is handled as a special
+		//   case. The host starts in Idle Member state for that group on every
+		//   interface, never transitions to another state, and never sends a
+		//   report for that group.
+		//
+		// As per RFC 2710 section 5 page 10 (for MLDv1),
+		//
+		//   The link-scope all-nodes address (FF02::1) is handled as a special
+		//   case. The node starts in Idle Listener state for that address on
+		//   every interface, never transitions to another state, and never sends
+		//   a Report or Done for that address.
+		return
+	}
+
+	// Okay to ignore the error here as if packet write failed, the multicast
+	// routers will eventually drop our membership anyways. If the interface is
+	// being disabled or removed, the generic multicast protocol's should be
+	// cleared eventually.
+	//
+	// As per RFC 2236 section 3 page 5 (for IGMPv2),
+	//
+	//   When a router receives a Report, it adds the group being reported to
+	//   the list of multicast group memberships on the network on which it
+	//   received the Report and sets the timer for the membership to the
+	//   [Group Membership Interval]. Repeated Reports refresh the timer. If
+	//   no Reports are received for a particular group before this timer has
+	//   expired, the router assumes that the group has no local members and
+	//   that it need not forward remotely-originated multicasts for that
+	//   group onto the attached network.
+	//
+	// As per RFC 2710 section 4 page 5 (for MLDv1),
+	//
+	//   When a router receives a Report from a link, if the reported address
+	//   is not already present in the router's list of multicast address
+	//   having listeners on that link, the reported address is added to the
+	//   list, its timer is set to [Multicast Listener Interval], and its
+	//   appearance is made known to the router's multicast routing component.
+	//   If a Report is received for a multicast address that is already
+	//   present in the router's list, the timer for that address is reset to
+	//   [Multicast Listener Interval]. If an address's timer expires, it is
+	//   assumed that there are no longer any listeners for that address
+	//   present on the link, so it is deleted from the list and its
+	//   disappearance is made known to the multicast routing component.
+	//
+	// The requirement to send a leave message is also optional (it MAY be
+	// skipped):
+	//
+	// As per RFC 2236 section 6 page 8 (for IGMPv2),
+	//
+	//  "send leave" for the group on the interface. If the interface
+	//   state says the Querier is running IGMPv1, this action SHOULD be
+	//   skipped. If the flag saying we were the last host to report is
+	//   cleared, this action MAY be skipped. The Leave Message is sent to
+	//   the ALL-ROUTERS group (224.0.0.2).
+	//
+	// As per RFC 2710 section 5 page 8 (for MLDv1),
+	//
+	//   "send done" for the address on the interface. If the flag saying
+	//   we were the last node to report is cleared, this action MAY be
+	//   skipped. The Done message is sent to the link-scope all-routers
+	//   address (FF02::2).
+	_ = g.opts.Protocol.SendLeave(groupAddress)
+}
+
+// transitionToNonMemberLocked transitions the given multicast group the the
+// non-member/listener state.
+//
+// Precondition: e.mu must be locked.
+func (g *GenericMulticastProtocolState) transitionToNonMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) {
+	if info.state == nonMember {
+		return
+	}
+
+	info.delayedReportJob.Cancel()
+	g.maybeSendLeave(groupAddress, info.lastToSendReport)
+	info.lastToSendReport = false
+	info.state = nonMember
+}
+
+// setDelayTimerForAddressRLocked sets timer to send a delay report.
+//
+// Precondition: g.mu MUST be read locked.
+func (g *GenericMulticastProtocolState) setDelayTimerForAddressRLocked(groupAddress tcpip.Address, info *multicastGroupState, maxResponseTime time.Duration) {
+	if info.state == nonMember {
+		return
+	}
+
+	if groupAddress == g.opts.AllNodesAddress {
+		// As per RFC 2236 section 6 page 10 (for IGMPv2),
+		//
+		//   The all-systems group (address 224.0.0.1) is handled as a special
+		//   case. The host starts in Idle Member state for that group on every
+		//   interface, never transitions to another state, and never sends a
+		//   report for that group.
+		//
+		// As per RFC 2710 section 5 page 10 (for MLDv1),
+		//
+		//   The link-scope all-nodes address (FF02::1) is handled as a special
+		//   case. The node starts in Idle Listener state for that address on
+		//   every interface, never transitions to another state, and never sends
+		//   a Report or Done for that address.
+		return
+	}
+
+	// As per RFC 2236 section 3 page 3 (for IGMPv2),
+	//
+	//   If a timer for the group is already unning, it is reset to the random
+	//   value only if the requested Max Response Time is less than the remaining
+	//   value of the running timer.
+	//
+	// As per RFC 2710 section 4 page 5 (for MLDv1),
+	//
+	//   If a timer for any address is already running, it is reset to the new
+	//   random value only if the requested Maximum Response Delay is less than
+	//   the remaining value of the running timer.
+	if info.state == delayingMember {
+		// TODO: Reset the timer if time remaining is greater than maxResponseTime.
+		return
+	}
+	info.state = delayingMember
+	info.delayedReportJob.Cancel()
+	info.delayedReportJob.Schedule(g.calculateDelayTimerDuration(maxResponseTime))
+}
+
+// calculateDelayTimerDuration returns a random time between (0, maxRespTime].
+func (g *GenericMulticastProtocolState) calculateDelayTimerDuration(maxRespTime time.Duration) time.Duration {
+	// As per RFC 2236 section 3 page 3 (for IGMPv2),
+	//
+	//   When a host receives a Group-Specific Query, it sets a delay timer to a
+	//   random value selected from the range (0, Max Response Time]...
+	//
+	// As per RFC 2710 section 4 page 6 (for MLDv1),
+	//
+	//   When a node receives a Multicast-Address-Specific Query, if it is
+	//   listening to the queried Multicast Address on the interface from
+	//   which the Query was received, it sets a delay timer for that address
+	//   to a random value selected from the range [0, Maximum Response Delay],
+	//   as above.
+	if maxRespTime == 0 {
+		return 0
+	}
+	return time.Duration(g.opts.Rand.Int63n(int64(maxRespTime)))
+}
diff --git a/pkg/tcpip/network/ip/generic_multicast_protocol_test.go b/pkg/tcpip/network/ip/generic_multicast_protocol_test.go
new file mode 100644
index 000000000..670be30d4
--- /dev/null
+++ b/pkg/tcpip/network/ip/generic_multicast_protocol_test.go
@@ -0,0 +1,576 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ip_test
+
+import (
+	"math/rand"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ip"
+)
+
+const (
+	addr1 = tcpip.Address("\x01")
+	addr2 = tcpip.Address("\x02")
+	addr3 = tcpip.Address("\x03")
+	addr4 = tcpip.Address("\x04")
+
+	maxUnsolicitedReportDelay = time.Second
+)
+
+var _ ip.MulticastGroupProtocol = (*mockMulticastGroupProtocol)(nil)
+
+type mockMulticastGroupProtocol struct {
+	sendReportGroupAddrCount map[tcpip.Address]int
+	sendLeaveGroupAddrCount  map[tcpip.Address]int
+}
+
+func (m *mockMulticastGroupProtocol) init() {
+	m.sendReportGroupAddrCount = make(map[tcpip.Address]int)
+	m.sendLeaveGroupAddrCount = make(map[tcpip.Address]int)
+}
+
+func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) *tcpip.Error {
+	m.sendReportGroupAddrCount[groupAddress]++
+	return nil
+}
+
+func (m *mockMulticastGroupProtocol) SendLeave(groupAddress tcpip.Address) *tcpip.Error {
+	m.sendLeaveGroupAddrCount[groupAddress]++
+	return nil
+}
+
+func checkProtocol(mgp *mockMulticastGroupProtocol, sendReportGroupAddresses []tcpip.Address, sendLeaveGroupAddresses []tcpip.Address) string {
+	sendReportGroupAddressesMap := make(map[tcpip.Address]int)
+	for _, a := range sendReportGroupAddresses {
+		sendReportGroupAddressesMap[a] = 1
+	}
+
+	sendLeaveGroupAddressesMap := make(map[tcpip.Address]int)
+	for _, a := range sendLeaveGroupAddresses {
+		sendLeaveGroupAddressesMap[a] = 1
+	}
+
+	diff := cmp.Diff(mockMulticastGroupProtocol{
+		sendReportGroupAddrCount: sendReportGroupAddressesMap,
+		sendLeaveGroupAddrCount:  sendLeaveGroupAddressesMap,
+	}, *mgp, cmp.AllowUnexported(mockMulticastGroupProtocol{}))
+	mgp.init()
+	return diff
+}
+
+func TestJoinGroup(t *testing.T) {
+	tests := []struct {
+		name              string
+		addr              tcpip.Address
+		shouldSendReports bool
+	}{
+		{
+			name:              "Normal group",
+			addr:              addr1,
+			shouldSendReports: true,
+		},
+		{
+			name:              "All-nodes group",
+			addr:              addr2,
+			shouldSendReports: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var g ip.GenericMulticastProtocolState
+			var mgp mockMulticastGroupProtocol
+			mgp.init()
+			clock := faketime.NewManualClock()
+			g.Init(ip.GenericMulticastProtocolOptions{
+				Enabled:                   true,
+				Rand:                      rand.New(rand.NewSource(0)),
+				Clock:                     clock,
+				Protocol:                  &mgp,
+				MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay,
+				AllNodesAddress:           addr2,
+			})
+
+			// Joining a group should send a report immediately and another after
+			// a random interval between 0 and the maximum unsolicited report delay.
+			g.JoinGroup(test.addr, false /* dontInitialize */)
+			if test.shouldSendReports {
+				if diff := checkProtocol(&mgp, []tcpip.Address{test.addr} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+					t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+				}
+
+				clock.Advance(maxUnsolicitedReportDelay)
+				if diff := checkProtocol(&mgp, []tcpip.Address{test.addr} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+					t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+				}
+			}
+
+			// Should have no more messages to send.
+			clock.Advance(time.Hour)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestLeaveGroup(t *testing.T) {
+	tests := []struct {
+		name               string
+		addr               tcpip.Address
+		shouldSendMessages bool
+	}{
+		{
+			name:               "Normal group",
+			addr:               addr1,
+			shouldSendMessages: true,
+		},
+		{
+			name:               "All-nodes group",
+			addr:               addr2,
+			shouldSendMessages: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var g ip.GenericMulticastProtocolState
+			var mgp mockMulticastGroupProtocol
+			mgp.init()
+			clock := faketime.NewManualClock()
+			g.Init(ip.GenericMulticastProtocolOptions{
+				Enabled:                   true,
+				Rand:                      rand.New(rand.NewSource(1)),
+				Clock:                     clock,
+				Protocol:                  &mgp,
+				MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay,
+				AllNodesAddress:           addr2,
+			})
+
+			g.JoinGroup(test.addr, false /* dontInitialize */)
+			if test.shouldSendMessages {
+				if diff := checkProtocol(&mgp, []tcpip.Address{test.addr} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+					t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+				}
+			}
+
+			// Leaving a group should send a leave report immediately and cancel any
+			// delayed reports.
+			if !g.LeaveGroup(test.addr) {
+				t.Fatalf("got g.LeaveGroup(%s) = false, want = true", test.addr)
+			}
+			if test.shouldSendMessages {
+				if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, []tcpip.Address{test.addr} /* sendLeaveGroupAddresses */); diff != "" {
+					t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+				}
+			}
+
+			// Should have no more messages to send.
+			clock.Advance(time.Hour)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestHandleReport(t *testing.T) {
+	tests := []struct {
+		name             string
+		reportAddr       tcpip.Address
+		expectReportsFor []tcpip.Address
+	}{
+		{
+			name:             "Unpecified empty",
+			reportAddr:       "",
+			expectReportsFor: []tcpip.Address{addr1, addr2},
+		},
+		{
+			name:             "Unpecified any",
+			reportAddr:       "\x00",
+			expectReportsFor: []tcpip.Address{addr1, addr2},
+		},
+		{
+			name:             "Specified",
+			reportAddr:       addr1,
+			expectReportsFor: []tcpip.Address{addr2},
+		},
+		{
+			name:             "Specified all-nodes",
+			reportAddr:       addr3,
+			expectReportsFor: []tcpip.Address{addr1, addr2},
+		},
+		{
+			name:             "Specified other",
+			reportAddr:       addr4,
+			expectReportsFor: []tcpip.Address{addr1, addr2},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var g ip.GenericMulticastProtocolState
+			var mgp mockMulticastGroupProtocol
+			mgp.init()
+			clock := faketime.NewManualClock()
+			g.Init(ip.GenericMulticastProtocolOptions{
+				Enabled:                   true,
+				Rand:                      rand.New(rand.NewSource(2)),
+				Clock:                     clock,
+				Protocol:                  &mgp,
+				MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay,
+				AllNodesAddress:           addr3,
+			})
+
+			g.JoinGroup(addr1, false /* dontInitialize */)
+			if diff := checkProtocol(&mgp, []tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+			g.JoinGroup(addr2, false /* dontInitialize */)
+			if diff := checkProtocol(&mgp, []tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+			g.JoinGroup(addr3, false /* dontInitialize */)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+
+			// Receiving a report for a group we have a timer scheduled for should
+			// cancel our delayed report timer for the group.
+			g.HandleReport(test.reportAddr)
+			if len(test.expectReportsFor) != 0 {
+				clock.Advance(maxUnsolicitedReportDelay)
+				if diff := checkProtocol(&mgp, test.expectReportsFor /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+					t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+				}
+			}
+
+			// Should have no more messages to send.
+			clock.Advance(time.Hour)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestHandleQuery(t *testing.T) {
+	tests := []struct {
+		name             string
+		queryAddr        tcpip.Address
+		maxDelay         time.Duration
+		expectReportsFor []tcpip.Address
+	}{
+		{
+			name:             "Unpecified empty",
+			queryAddr:        "",
+			maxDelay:         0,
+			expectReportsFor: []tcpip.Address{addr1, addr2},
+		},
+		{
+			name:             "Unpecified any",
+			queryAddr:        "\x00",
+			maxDelay:         1,
+			expectReportsFor: []tcpip.Address{addr1, addr2},
+		},
+		{
+			name:             "Specified",
+			queryAddr:        addr1,
+			maxDelay:         2,
+			expectReportsFor: []tcpip.Address{addr1},
+		},
+		{
+			name:             "Specified all-nodes",
+			queryAddr:        addr3,
+			maxDelay:         3,
+			expectReportsFor: nil,
+		},
+		{
+			name:             "Specified other",
+			queryAddr:        addr4,
+			maxDelay:         4,
+			expectReportsFor: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var g ip.GenericMulticastProtocolState
+			var mgp mockMulticastGroupProtocol
+			mgp.init()
+			clock := faketime.NewManualClock()
+			g.Init(ip.GenericMulticastProtocolOptions{
+				Enabled:                   true,
+				Rand:                      rand.New(rand.NewSource(3)),
+				Clock:                     clock,
+				Protocol:                  &mgp,
+				MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay,
+				AllNodesAddress:           addr3,
+			})
+
+			g.JoinGroup(addr1, false /* dontInitialize */)
+			if diff := checkProtocol(&mgp, []tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+			g.JoinGroup(addr2, false /* dontInitialize */)
+			if diff := checkProtocol(&mgp, []tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+			g.JoinGroup(addr3, false /* dontInitialize */)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+			clock.Advance(maxUnsolicitedReportDelay)
+			if diff := checkProtocol(&mgp, []tcpip.Address{addr1, addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+
+			// Receiving a query should make us schedule a new delayed report if it
+			// is a query directed at us or a general query.
+			g.HandleQuery(test.queryAddr, test.maxDelay)
+			if len(test.expectReportsFor) != 0 {
+				clock.Advance(test.maxDelay)
+				if diff := checkProtocol(&mgp, test.expectReportsFor /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+					t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+				}
+			}
+
+			// Should have no more messages to send.
+			clock.Advance(time.Hour)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestJoinCount(t *testing.T) {
+	var g ip.GenericMulticastProtocolState
+	var mgp mockMulticastGroupProtocol
+	mgp.init()
+	clock := faketime.NewManualClock()
+	g.Init(ip.GenericMulticastProtocolOptions{
+		Enabled:                   true,
+		Rand:                      rand.New(rand.NewSource(4)),
+		Clock:                     clock,
+		Protocol:                  &mgp,
+		MaxUnsolicitedReportDelay: time.Second,
+	})
+
+	// Set the join count to 2 for a group.
+	g.JoinGroup(addr1, false /* dontInitialize */)
+	if !g.IsLocallyJoined(addr1) {
+		t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", addr1)
+	}
+	// Only the first join should trigger a report to be sent.
+	if diff := checkProtocol(&mgp, []tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+	g.JoinGroup(addr1, false /* dontInitialize */)
+	if !g.IsLocallyJoined(addr1) {
+		t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", addr1)
+	}
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+
+	// Group should still be considered joined after leaving once.
+	if !g.LeaveGroup(addr1) {
+		t.Fatalf("got g.LeaveGroup(%s) = false, want = true", addr1)
+	}
+	if !g.IsLocallyJoined(addr1) {
+		t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", addr1)
+	}
+	// A leave report should only be sent once the join count reaches 0.
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+
+	// Leaving once more should actually remove us from the group.
+	if !g.LeaveGroup(addr1) {
+		t.Fatalf("got g.LeaveGroup(%s) = false, want = true", addr1)
+	}
+	if g.IsLocallyJoined(addr1) {
+		t.Fatalf("got g.IsLocallyJoined(%s) = true, want = false", addr1)
+	}
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, []tcpip.Address{addr1} /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+
+	// Group should no longer be joined so we should not have anything to
+	// leave.
+	if g.LeaveGroup(addr1) {
+		t.Fatalf("got g.LeaveGroup(%s) = true, want = false", addr1)
+	}
+	if g.IsLocallyJoined(addr1) {
+		t.Fatalf("got g.IsLocallyJoined(%s) = true, want = false", addr1)
+	}
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+
+	// Should have no more messages to send.
+	clock.Advance(time.Hour)
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+}
+
+func TestMakeAllNonMemberAndInitialize(t *testing.T) {
+	var g ip.GenericMulticastProtocolState
+	var mgp mockMulticastGroupProtocol
+	mgp.init()
+	clock := faketime.NewManualClock()
+	g.Init(ip.GenericMulticastProtocolOptions{
+		Enabled:                   true,
+		Rand:                      rand.New(rand.NewSource(3)),
+		Clock:                     clock,
+		Protocol:                  &mgp,
+		MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay,
+		AllNodesAddress:           addr3,
+	})
+
+	g.JoinGroup(addr1, false /* dontInitialize */)
+	if diff := checkProtocol(&mgp, []tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+	g.JoinGroup(addr2, false /* dontInitialize */)
+	if diff := checkProtocol(&mgp, []tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+	g.JoinGroup(addr3, false /* dontInitialize */)
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+
+	// Should send the leave reports for each but still consider them locally
+	// joined.
+	g.MakeAllNonMember()
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, []tcpip.Address{addr1, addr2} /* sendLeaveGroupAddresses */); diff != "" {
+		t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+	clock.Advance(time.Hour)
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+	for _, group := range []tcpip.Address{addr1, addr2, addr3} {
+		if !g.IsLocallyJoined(group) {
+			t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", group)
+		}
+	}
+
+	// Should send the initial set of unsolcited reports.
+	g.InitializeGroups()
+	if diff := checkProtocol(&mgp, []tcpip.Address{addr1, addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+	clock.Advance(maxUnsolicitedReportDelay)
+	if diff := checkProtocol(&mgp, []tcpip.Address{addr1, addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+
+	// Should have no more messages to send.
+	clock.Advance(time.Hour)
+	if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+		t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+	}
+}
+
+// TestGroupStateNonMember tests that groups do not send packets when in the
+// non-member state, but are still considered locally joined.
+func TestGroupStateNonMember(t *testing.T) {
+	tests := []struct {
+		name           string
+		enabled        bool
+		dontInitialize bool
+	}{
+		{
+			name:           "Disabled",
+			enabled:        false,
+			dontInitialize: false,
+		},
+		{
+			name:           "Keep non-member",
+			enabled:        true,
+			dontInitialize: true,
+		},
+		{
+			name:           "disabled and Keep non-member",
+			enabled:        false,
+			dontInitialize: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var g ip.GenericMulticastProtocolState
+			var mgp mockMulticastGroupProtocol
+			mgp.init()
+			clock := faketime.NewManualClock()
+			g.Init(ip.GenericMulticastProtocolOptions{
+				Enabled:                   test.enabled,
+				Rand:                      rand.New(rand.NewSource(3)),
+				Clock:                     clock,
+				Protocol:                  &mgp,
+				MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay,
+			})
+
+			g.JoinGroup(addr1, test.dontInitialize)
+			if !g.IsLocallyJoined(addr1) {
+				t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", addr1)
+			}
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+
+			g.JoinGroup(addr2, test.dontInitialize)
+			if !g.IsLocallyJoined(addr2) {
+				t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", addr2)
+			}
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+
+			g.HandleQuery(addr1, time.Nanosecond)
+			clock.Advance(time.Nanosecond)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+
+			if !g.LeaveGroup(addr2) {
+				t.Errorf("got g.LeaveGroup(%s) = false, want = true", addr2)
+			}
+			if !g.IsLocallyJoined(addr1) {
+				t.Fatalf("got g.IsLocallyJoined(%s) = false, want = true", addr1)
+			}
+			if g.IsLocallyJoined(addr2) {
+				t.Fatalf("got g.IsLocallyJoined(%s) = true, want = false", addr2)
+			}
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+
+			clock.Advance(time.Hour)
+			if diff := checkProtocol(&mgp, nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" {
+				t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index d49c44846..a314dd386 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -193,10 +193,6 @@ func (*testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBu
 	panic("not implemented")
 }
 
-func (*testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error {
-	return tcpip.ErrNotSupported
-}
-
 // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
 func (*testObject) ARPHardwareType() header.ARPHardwareType {
 	panic("not implemented")
@@ -207,7 +203,7 @@ func (*testObject) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net
 	panic("not implemented")
 }
 
-func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
+func buildIPv4Route(local, remote tcpip.Address) (*stack.Route, *tcpip.Error) {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
 		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol},
@@ -223,7 +219,7 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 	return s.FindRoute(nicID, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */)
 }
 
-func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
+func buildIPv6Route(local, remote tcpip.Address) (*stack.Route, *tcpip.Error) {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol},
@@ -554,7 +550,7 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+	if err := ep.WritePacket(r, nil /* gso */, stack.NetworkHeaderParams{
 		Protocol: 123,
 		TTL:      123,
 		TOS:      stack.DefaultTOS,
@@ -937,7 +933,7 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+	if err := ep.WritePacket(r, nil /* gso */, stack.NetworkHeaderParams{
 		Protocol: 123,
 		TTL:      123,
 		TOS:      stack.DefaultTOS,
@@ -1093,7 +1089,19 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 	dataBuf := [dataLen]byte{1, 2, 3, 4}
 	data := dataBuf[:]
 
-	ipv4Options := header.IPv4Options{0, 1, 0, 1}
+	ipv4Options := header.IPv4OptionsSerializer{
+		&header.IPv4SerializableListEndOption{},
+		&header.IPv4SerializableNOPOption{},
+		&header.IPv4SerializableListEndOption{},
+		&header.IPv4SerializableNOPOption{},
+	}
+
+	expectOptions := header.IPv4Options{
+		byte(header.IPv4OptionListEndType),
+		byte(header.IPv4OptionNOPType),
+		byte(header.IPv4OptionListEndType),
+		byte(header.IPv4OptionNOPType),
+	}
 
 	ipv6FragmentExtHdrBuf := [header.IPv6FragmentExtHdrLength]byte{transportProto, 0, 62, 4, 1, 2, 3, 4}
 	ipv6FragmentExtHdr := ipv6FragmentExtHdrBuf[:]
@@ -1243,7 +1251,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 			nicAddr:      localIPv4Addr,
 			remoteAddr:   remoteIPv4Addr,
 			pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView {
-				ipHdrLen := header.IPv4MinimumSize + ipv4Options.SizeWithPadding()
+				ipHdrLen := int(header.IPv4MinimumSize + ipv4Options.Length())
 				totalLen := ipHdrLen + len(data)
 				hdr := buffer.NewPrependable(totalLen)
 				if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
@@ -1266,7 +1274,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 
 				netHdr := pkt.NetworkHeader()
 
-				hdrLen := header.IPv4MinimumSize + len(ipv4Options)
+				hdrLen := int(header.IPv4MinimumSize + ipv4Options.Length())
 				if len(netHdr.View()) != hdrLen {
 					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen)
 				}
@@ -1276,7 +1284,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 					checker.DstAddr(remoteIPv4Addr),
 					checker.IPv4HeaderLength(hdrLen),
 					checker.IPFullLength(uint16(hdrLen+len(data))),
-					checker.IPv4Options(ipv4Options),
+					checker.IPv4Options(expectOptions),
 					checker.IPPayload(data),
 				)
 			},
@@ -1288,7 +1296,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 			nicAddr:      localIPv4Addr,
 			remoteAddr:   remoteIPv4Addr,
 			pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView {
-				ip := header.IPv4(make([]byte, header.IPv4MinimumSize+ipv4Options.SizeWithPadding()))
+				ip := header.IPv4(make([]byte, header.IPv4MinimumSize+ipv4Options.Length()))
 				ip.Encode(&header.IPv4Fields{
 					Protocol: transportProto,
 					TTL:      ipv4.DefaultTTL,
@@ -1307,7 +1315,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 
 				netHdr := pkt.NetworkHeader()
 
-				hdrLen := header.IPv4MinimumSize + len(ipv4Options)
+				hdrLen := int(header.IPv4MinimumSize + ipv4Options.Length())
 				if len(netHdr.View()) != hdrLen {
 					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen)
 				}
@@ -1317,7 +1325,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) {
 					checker.DstAddr(remoteIPv4Addr),
 					checker.IPv4HeaderLength(hdrLen),
 					checker.IPFullLength(uint16(hdrLen+len(data))),
-					checker.IPv4Options(ipv4Options),
+					checker.IPv4Options(expectOptions),
 					checker.IPPayload(data),
 				)
 			},
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 6252614ec..32f53f217 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "ipv4",
     srcs = [
         "icmp.go",
+        "igmp.go",
         "ipv4.go",
     ],
     visibility = ["//visibility:public"],
@@ -17,6 +18,7 @@ go_library(
         "//pkg/tcpip/header/parse",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
+        "//pkg/tcpip/network/ip",
         "//pkg/tcpip/stack",
     ],
 )
@@ -24,7 +26,10 @@ go_library(
 go_test(
     name = "ipv4_test",
     size = "small",
-    srcs = ["ipv4_test.go"],
+    srcs = [
+        "igmp_test.go",
+        "ipv4_test.go",
+    ],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 488945226..8e392f86c 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -63,7 +63,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 
 func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) {
 	stats := e.protocol.stack.Stats()
-	received := stats.ICMP.V4PacketsReceived
+	received := stats.ICMP.V4.PacketsReceived
 	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
 	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
 	// full explanation.
@@ -130,7 +130,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) {
 	case header.ICMPv4Echo:
 		received.Echo.Increment()
 
-		sent := stats.ICMP.V4PacketsSent
+		sent := stats.ICMP.V4.PacketsSent
 		if !e.protocol.stack.AllowICMPMessage() {
 			sent.RateLimited.Increment()
 			return
@@ -379,7 +379,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi
 	}
 	defer route.Release()
 
-	sent := p.stack.Stats().ICMP.V4PacketsSent
+	sent := p.stack.Stats().ICMP.V4.PacketsSent
 	if !p.stack.AllowICMPMessage() {
 		sent.RateLimited.Increment()
 		return nil
diff --git a/pkg/tcpip/network/ipv4/igmp.go b/pkg/tcpip/network/ipv4/igmp.go
new file mode 100644
index 000000000..0134fadc0
--- /dev/null
+++ b/pkg/tcpip/network/ipv4/igmp.go
@@ -0,0 +1,323 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	// igmpV1PresentDefault is the initial state for igmpV1Present in the
+	// igmpState. As per RFC 2236 Page 9 says "No IGMPv1 Router Present ... is
+	// the initial state."
+	igmpV1PresentDefault = 0
+
+	// v1RouterPresentTimeout from RFC 2236 Section 8.11, Page 18
+	// See note on igmpState.igmpV1Present for more detail.
+	v1RouterPresentTimeout = 400 * time.Second
+
+	// v1MaxRespTime from RFC 2236 Section 4, Page 5. "The IGMPv1 router
+	// will send General Queries with the Max Response Time set to 0. This MUST
+	// be interpreted as a value of 100 (10 seconds)."
+	//
+	// Note that the Max Response Time field is a value in units of deciseconds.
+	v1MaxRespTime = 10 * time.Second
+
+	// UnsolicitedReportIntervalMax is the maximum delay between sending
+	// unsolicited IGMP reports.
+	//
+	// Obtained from RFC 2236 Section 8.10, Page 19.
+	UnsolicitedReportIntervalMax = 10 * time.Second
+)
+
+// IGMPOptions holds options for IGMP.
+type IGMPOptions struct {
+	// Enabled indicates whether IGMP will be performed.
+	//
+	// When enabled, IGMP may transmit IGMP report and leave messages when
+	// joining and leaving multicast groups respectively, and handle incoming
+	// IGMP packets.
+	Enabled bool
+}
+
+var _ ip.MulticastGroupProtocol = (*igmpState)(nil)
+
+// igmpState is the per-interface IGMP state.
+//
+// igmpState.init() MUST be called after creating an IGMP state.
+type igmpState struct {
+	// The IPv4 endpoint this igmpState is for.
+	ep   *endpoint
+	opts IGMPOptions
+
+	// igmpV1Present is for maintaining compatibility with IGMPv1 Routers, from
+	// RFC 2236 Section 4 Page 6: "The IGMPv1 router expects Version 1
+	// Membership Reports in response to its Queries, and will not pay
+	// attention to Version 2 Membership Reports.  Therefore, a state variable
+	// MUST be kept for each interface, describing whether the multicast
+	// Querier on that interface is running IGMPv1 or IGMPv2.  This variable
+	// MUST be based upon whether or not an IGMPv1 query was heard in the last
+	// [Version 1 Router Present Timeout] seconds".
+	//
+	// Must be accessed with atomic operations. Holds a value of 1 when true, 0
+	// when false.
+	igmpV1Present uint32
+
+	mu struct {
+		sync.RWMutex
+
+		genericMulticastProtocol ip.GenericMulticastProtocolState
+
+		// igmpV1Job is scheduled when this interface receives an IGMPv1 style
+		// message, upon expiration the igmpV1Present flag is cleared.
+		// igmpV1Job may not be nil once igmpState is initialized.
+		igmpV1Job *tcpip.Job
+	}
+}
+
+// SendReport implements ip.MulticastGroupProtocol.
+func (igmp *igmpState) SendReport(groupAddress tcpip.Address) *tcpip.Error {
+	igmpType := header.IGMPv2MembershipReport
+	if igmp.v1Present() {
+		igmpType = header.IGMPv1MembershipReport
+	}
+	return igmp.writePacket(groupAddress, groupAddress, igmpType)
+}
+
+// SendLeave implements ip.MulticastGroupProtocol.
+func (igmp *igmpState) SendLeave(groupAddress tcpip.Address) *tcpip.Error {
+	// As per RFC 2236 Section 6, Page 8: "If the interface state says the
+	// Querier is running IGMPv1, this action SHOULD be skipped. If the flag
+	// saying we were the last host to report is cleared, this action MAY be
+	// skipped."
+	if igmp.v1Present() {
+		return nil
+	}
+	return igmp.writePacket(header.IPv4AllRoutersGroup, groupAddress, header.IGMPLeaveGroup)
+}
+
+// init sets up an igmpState struct, and is required to be called before using
+// a new igmpState.
+func (igmp *igmpState) init(ep *endpoint, opts IGMPOptions) {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+	igmp.ep = ep
+	igmp.opts = opts
+	igmp.mu.genericMulticastProtocol.Init(ip.GenericMulticastProtocolOptions{
+		Enabled:                   opts.Enabled,
+		Rand:                      ep.protocol.stack.Rand(),
+		Clock:                     ep.protocol.stack.Clock(),
+		Protocol:                  igmp,
+		MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax,
+		AllNodesAddress:           header.IPv4AllSystems,
+	})
+	igmp.igmpV1Present = igmpV1PresentDefault
+	igmp.mu.igmpV1Job = igmp.ep.protocol.stack.NewJob(&igmp.mu, func() {
+		igmp.setV1Present(false)
+	})
+}
+
+func (igmp *igmpState) handleIGMP(pkt *stack.PacketBuffer) {
+	stats := igmp.ep.protocol.stack.Stats()
+	received := stats.IGMP.PacketsReceived
+	headerView, ok := pkt.Data.PullUp(header.IGMPMinimumSize)
+	if !ok {
+		received.Invalid.Increment()
+		return
+	}
+	h := header.IGMP(headerView)
+
+	// Temporarily reset the checksum field to 0 in order to calculate the proper
+	// checksum.
+	wantChecksum := h.Checksum()
+	h.SetChecksum(0)
+	gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */)
+	h.SetChecksum(wantChecksum)
+
+	if gotChecksum != wantChecksum {
+		received.ChecksumErrors.Increment()
+		return
+	}
+
+	switch h.Type() {
+	case header.IGMPMembershipQuery:
+		received.MembershipQuery.Increment()
+		if len(headerView) < header.IGMPQueryMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+		igmp.handleMembershipQuery(h.GroupAddress(), h.MaxRespTime())
+	case header.IGMPv1MembershipReport:
+		received.V1MembershipReport.Increment()
+		if len(headerView) < header.IGMPReportMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+		igmp.handleMembershipReport(h.GroupAddress())
+	case header.IGMPv2MembershipReport:
+		received.V2MembershipReport.Increment()
+		if len(headerView) < header.IGMPReportMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+		igmp.handleMembershipReport(h.GroupAddress())
+	case header.IGMPLeaveGroup:
+		received.LeaveGroup.Increment()
+		// As per RFC 2236 Section 6, Page 7: "IGMP messages other than Query or
+		// Report, are ignored in all states"
+
+	default:
+		// As per RFC 2236 Section 2.1 Page 3: "Unrecognized message types should
+		// be silently ignored. New message types may be used by newer versions of
+		// IGMP, by multicast routing protocols, or other uses."
+		received.Unrecognized.Increment()
+	}
+}
+
+func (igmp *igmpState) v1Present() bool {
+	return atomic.LoadUint32(&igmp.igmpV1Present) == 1
+}
+
+func (igmp *igmpState) setV1Present(v bool) {
+	if v {
+		atomic.StoreUint32(&igmp.igmpV1Present, 1)
+	} else {
+		atomic.StoreUint32(&igmp.igmpV1Present, 0)
+	}
+}
+
+func (igmp *igmpState) handleMembershipQuery(groupAddress tcpip.Address, maxRespTime time.Duration) {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+
+	// As per RFC 2236 Section 6, Page 10: If the maximum response time is zero
+	// then change the state to note that an IGMPv1 router is present and
+	// schedule the query received Job.
+	if maxRespTime == 0 && igmp.opts.Enabled {
+		igmp.mu.igmpV1Job.Cancel()
+		igmp.mu.igmpV1Job.Schedule(v1RouterPresentTimeout)
+		igmp.setV1Present(true)
+		maxRespTime = v1MaxRespTime
+	}
+
+	igmp.mu.genericMulticastProtocol.HandleQuery(groupAddress, maxRespTime)
+}
+
+func (igmp *igmpState) handleMembershipReport(groupAddress tcpip.Address) {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+	igmp.mu.genericMulticastProtocol.HandleReport(groupAddress)
+}
+
+// writePacket assembles and sends an IGMP packet with the provided fields,
+// incrementing the provided stat counter on success.
+func (igmp *igmpState) writePacket(destAddress tcpip.Address, groupAddress tcpip.Address, igmpType header.IGMPType) *tcpip.Error {
+	igmpData := header.IGMP(buffer.NewView(header.IGMPReportMinimumSize))
+	igmpData.SetType(igmpType)
+	igmpData.SetGroupAddress(groupAddress)
+	igmpData.SetChecksum(header.IGMPCalculateChecksum(igmpData))
+
+	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: int(igmp.ep.MaxHeaderLength()),
+		Data:               buffer.View(igmpData).ToVectorisedView(),
+	})
+
+	// TODO(gvisor.dev/issue/4888): We should not use the unspecified address,
+	// rather we should select an appropriate local address.
+	localAddr := header.IPv4Any
+	igmp.ep.addIPHeader(localAddr, destAddress, pkt, stack.NetworkHeaderParams{
+		Protocol: header.IGMPProtocolNumber,
+		TTL:      header.IGMPTTL,
+		TOS:      stack.DefaultTOS,
+	}, header.IPv4OptionsSerializer{
+		&header.IPv4SerializableRouterAlertOption{},
+	})
+
+	sent := igmp.ep.protocol.stack.Stats().IGMP.PacketsSent
+	if err := igmp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv4Address(destAddress), nil /* gso */, ProtocolNumber, pkt); err != nil {
+		sent.Dropped.Increment()
+		return err
+	}
+	switch igmpType {
+	case header.IGMPv1MembershipReport:
+		sent.V1MembershipReport.Increment()
+	case header.IGMPv2MembershipReport:
+		sent.V2MembershipReport.Increment()
+	case header.IGMPLeaveGroup:
+		sent.LeaveGroup.Increment()
+	default:
+		panic(fmt.Sprintf("unrecognized igmp type = %d", igmpType))
+	}
+	return nil
+}
+
+// joinGroup handles adding a new group to the membership map, setting up the
+// IGMP state for the group, and sending and scheduling the required
+// messages.
+//
+// If the group already exists in the membership map, returns
+// tcpip.ErrDuplicateAddress.
+func (igmp *igmpState) joinGroup(groupAddress tcpip.Address) {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+	igmp.mu.genericMulticastProtocol.JoinGroup(groupAddress, !igmp.ep.Enabled() /* dontInitialize */)
+}
+
+// isInGroup returns true if the specified group has been joined locally.
+func (igmp *igmpState) isInGroup(groupAddress tcpip.Address) bool {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+	return igmp.mu.genericMulticastProtocol.IsLocallyJoined(groupAddress)
+}
+
+// leaveGroup handles removing the group from the membership map, cancels any
+// delay timers associated with that group, and sends the Leave Group message
+// if required.
+func (igmp *igmpState) leaveGroup(groupAddress tcpip.Address) *tcpip.Error {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+
+	// LeaveGroup returns false only if the group was not joined.
+	if igmp.mu.genericMulticastProtocol.LeaveGroup(groupAddress) {
+		return nil
+	}
+
+	return tcpip.ErrBadLocalAddress
+}
+
+// softLeaveAll leaves all groups from the perspective of IGMP, but remains
+// joined locally.
+func (igmp *igmpState) softLeaveAll() {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+	igmp.mu.genericMulticastProtocol.MakeAllNonMember()
+}
+
+// initializeAll attemps to initialize the IGMP state for each group that has
+// been joined locally.
+func (igmp *igmpState) initializeAll() {
+	igmp.mu.Lock()
+	defer igmp.mu.Unlock()
+	igmp.mu.genericMulticastProtocol.InitializeGroups()
+}
diff --git a/pkg/tcpip/network/ipv4/igmp_test.go b/pkg/tcpip/network/ipv4/igmp_test.go
new file mode 100644
index 000000000..5e139377b
--- /dev/null
+++ b/pkg/tcpip/network/ipv4/igmp_test.go
@@ -0,0 +1,156 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	linkAddr      = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	multicastAddr = tcpip.Address("\xe0\x00\x00\x03")
+	nicID         = 1
+)
+
+// validateIgmpPacket checks that a passed PacketInfo is an IPv4 IGMP packet
+// sent to the provided address with the passed fields set. Raises a t.Error if
+// any field does not match.
+func validateIgmpPacket(t *testing.T, p channel.PacketInfo, remoteAddress tcpip.Address, igmpType header.IGMPType, maxRespTime byte, groupAddress tcpip.Address) {
+	t.Helper()
+
+	payload := header.IPv4(stack.PayloadSince(p.Pkt.NetworkHeader()))
+	checker.IPv4(t, payload,
+		checker.DstAddr(remoteAddress),
+		// TTL for an IGMP message must be 1 as per RFC 2236 section 2.
+		checker.TTL(1),
+		checker.IPv4RouterAlert(),
+		checker.IGMP(
+			checker.IGMPType(igmpType),
+			checker.IGMPMaxRespTime(header.DecisecondToDuration(maxRespTime)),
+			checker.IGMPGroupAddress(groupAddress),
+		),
+	)
+}
+
+func createStack(t *testing.T, igmpEnabled bool) (*channel.Endpoint, *stack.Stack, *faketime.ManualClock) {
+	t.Helper()
+
+	// Create an endpoint of queue size 1, since no more than 1 packets are ever
+	// queued in the tests in this file.
+	e := channel.New(1, 1280, linkAddr)
+	clock := faketime.NewManualClock()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocolWithOptions(ipv4.Options{
+			IGMP: ipv4.IGMPOptions{
+				Enabled: igmpEnabled,
+			},
+		})},
+		Clock: clock,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	return e, s, clock
+}
+
+func createAndInjectIGMPPacket(e *channel.Endpoint, igmpType header.IGMPType, maxRespTime byte, groupAddress tcpip.Address) {
+	buf := buffer.NewView(header.IPv4MinimumSize + header.IGMPQueryMinimumSize)
+
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		TotalLength: uint16(len(buf)),
+		TTL:         1,
+		Protocol:    uint8(header.IGMPProtocolNumber),
+		SrcAddr:     header.IPv4Any,
+		DstAddr:     header.IPv4AllSystems,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	igmp := header.IGMP(buf[header.IPv4MinimumSize:])
+	igmp.SetType(igmpType)
+	igmp.SetMaxRespTime(maxRespTime)
+	igmp.SetGroupAddress(groupAddress)
+	igmp.SetChecksum(header.IGMPCalculateChecksum(igmp))
+
+	e.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+}
+
+// TestIgmpV1Present tests the handling of the case where an IGMPv1 router is
+// present on the network. The IGMP stack will then send IGMPv1 Membership
+// reports for backwards compatibility.
+func TestIgmpV1Present(t *testing.T) {
+	e, s, clock := createStack(t, true)
+
+	if err := s.JoinGroup(ipv4.ProtocolNumber, nicID, multicastAddr); err != nil {
+		t.Fatalf("JoinGroup(ipv4, nic, %s) = %s", multicastAddr, err)
+	}
+
+	// This NIC will send an IGMPv2 report immediately, before this test can get
+	// the IGMPv1 General Membership Query in.
+	p, ok := e.Read()
+	if !ok {
+		t.Fatal("unable to Read IGMP packet, expected V2MembershipReport")
+	}
+	if got := s.Stats().IGMP.PacketsSent.V2MembershipReport.Value(); got != 1 {
+		t.Fatalf("got V2MembershipReport messages sent = %d, want = 1", got)
+	}
+	validateIgmpPacket(t, p, multicastAddr, header.IGMPv2MembershipReport, 0, multicastAddr)
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	// Inject an IGMPv1 General Membership Query which is identical to a standard
+	// membership query except the Max Response Time is set to 0, which will tell
+	// the stack that this is a router using IGMPv1. Send it to the all systems
+	// group which is the only group this host belongs to.
+	createAndInjectIGMPPacket(e, header.IGMPMembershipQuery, 0, header.IPv4AllSystems)
+	if got := s.Stats().IGMP.PacketsReceived.MembershipQuery.Value(); got != 1 {
+		t.Fatalf("got Membership Queries received = %d, want = 1", got)
+	}
+
+	// Before advancing the clock, verify that this host has not sent a
+	// V1MembershipReport yet.
+	if got := s.Stats().IGMP.PacketsSent.V1MembershipReport.Value(); got != 0 {
+		t.Fatalf("got V1MembershipReport messages sent = %d, want = 0", got)
+	}
+
+	// Verify the solicited Membership Report is sent. Now that this NIC has seen
+	// an IGMPv1 query, it should send an IGMPv1 Membership Report.
+	p, ok = e.Read()
+	if ok {
+		t.Fatalf("sent unexpected packet, expected V1MembershipReport only after advancing the clock = %+v", p.Pkt)
+	}
+	clock.Advance(ipv4.UnsolicitedReportIntervalMax)
+	p, ok = e.Read()
+	if !ok {
+		t.Fatal("unable to Read IGMP packet, expected V1MembershipReport")
+	}
+	if got := s.Stats().IGMP.PacketsSent.V1MembershipReport.Value(); got != 1 {
+		t.Fatalf("got V1MembershipReport messages sent = %d, want = 1", got)
+	}
+	validateIgmpPacket(t, p, multicastAddr, header.IGMPv1MembershipReport, 0, multicastAddr)
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 1efe6297a..3076185cd 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -72,6 +72,7 @@ type endpoint struct {
 	nic        stack.NetworkInterface
 	dispatcher stack.TransportDispatcher
 	protocol   *protocol
+	igmp       igmpState
 
 	// enabled is set to 1 when the enpoint is enabled and 0 when it is
 	// disabled.
@@ -94,6 +95,7 @@ func (p *protocol) NewEndpoint(nic stack.NetworkInterface, _ stack.LinkAddressCa
 		protocol:   p,
 	}
 	e.mu.addressableEndpointState.Init(e)
+	e.igmp.init(e, p.options.IGMP)
 	return e
 }
 
@@ -121,11 +123,22 @@ func (e *endpoint) Enable() *tcpip.Error {
 	// We have no need for the address endpoint.
 	ep.DecRef()
 
+	// Groups may have been joined while the endpoint was disabled, or the
+	// endpoint may have left groups from the perspective of IGMP when the
+	// endpoint was disabled. Either way, we need to let routers know to
+	// send us multicast traffic.
+	e.igmp.initializeAll()
+
 	// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
 	// multicast group. Note, the IANA calls the all-hosts multicast group the
 	// all-systems multicast group.
-	_, err = e.mu.addressableEndpointState.JoinGroup(header.IPv4AllSystems)
-	return err
+	if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil {
+		// joinGroupLocked only returns an error if the group address is not a valid
+		// IPv4 multicast address.
+		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err))
+	}
+
+	return nil
 }
 
 // Enabled implements stack.NetworkEndpoint.
@@ -162,10 +175,14 @@ func (e *endpoint) disableLocked() {
 	}
 
 	// The endpoint may have already left the multicast group.
-	if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv4AllSystems); err != nil && err != tcpip.ErrBadLocalAddress {
+	if err := e.leaveGroupLocked(header.IPv4AllSystems); err != nil && err != tcpip.ErrBadLocalAddress {
 		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
 	}
 
+	// Leave groups from the perspective of IGMP so that routers know that
+	// we are no longer interested in the group.
+	e.igmp.softLeaveAll()
+
 	// The address may have already been removed.
 	if err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err != nil && err != tcpip.ErrBadLocalAddress {
 		panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
@@ -198,37 +215,34 @@ func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 	return e.protocol.Number()
 }
 
-func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
+func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) {
 	hdrLen := header.IPv4MinimumSize
-	var opts header.IPv4Options
-	if params.Options != nil {
-		var ok bool
-		if opts, ok = params.Options.(header.IPv4Options); !ok {
-			panic(fmt.Sprintf("want IPv4Options, got %T", params.Options))
-		}
-		hdrLen += opts.SizeWithPadding()
-		if hdrLen > header.IPv4MaximumHeaderSize {
-			// Since we have no way to report an error we must either panic or create
-			// a packet which is different to what was requested. Choose panic as this
-			// would be a programming error that should be caught in testing.
-			panic(fmt.Sprintf("IPv4 Options %d bytes, Max %d", params.Options.SizeWithPadding(), header.IPv4MaximumOptionsSize))
-		}
+	var optLen int
+	if options != nil {
+		optLen = int(options.Length())
+	}
+	hdrLen += optLen
+	if hdrLen > header.IPv4MaximumHeaderSize {
+		// Since we have no way to report an error we must either panic or create
+		// a packet which is different to what was requested. Choose panic as this
+		// would be a programming error that should be caught in testing.
+		panic(fmt.Sprintf("IPv4 Options %d bytes, Max %d", optLen, header.IPv4MaximumOptionsSize))
 	}
 	ip := header.IPv4(pkt.NetworkHeader().Push(hdrLen))
 	length := uint16(pkt.Size())
 	// RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic
 	// datagrams. Since the DF bit is never being set here, all datagrams
 	// are non-atomic and need an ID.
-	id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, params.Protocol, e.protocol.hashIV)%buckets], 1)
+	id := atomic.AddUint32(&e.protocol.ids[hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV)%buckets], 1)
 	ip.Encode(&header.IPv4Fields{
 		TotalLength: length,
 		ID:          uint16(id),
 		TTL:         params.TTL,
 		TOS:         params.TOS,
 		Protocol:    uint8(params.Protocol),
-		SrcAddr:     r.LocalAddress,
-		DstAddr:     r.RemoteAddress,
-		Options:     opts,
+		SrcAddr:     srcAddr,
+		DstAddr:     dstAddr,
+		Options:     options,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
 	pkt.NetworkProtocolNumber = ProtocolNumber
@@ -259,7 +273,7 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU ui
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
-	e.addIPHeader(r, pkt, params)
+	e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */)
 
 	// iptables filtering. All packets that reach here are locally
 	// generated.
@@ -347,7 +361,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 	}
 
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.addIPHeader(r, pkt, params)
+		e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */)
 		networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
 		if err != nil {
 			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
@@ -461,7 +475,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
 		// non-atomic datagrams, so assign an ID to all such datagrams
 		// according to the definition given in RFC 6864 section 4.
 		if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 {
-			ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
+			ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress, r.RemoteAddress, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
 		}
 	}
 
@@ -566,21 +580,6 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 		stats.IP.MalformedPacketsReceived.Increment()
 		return
 	}
-	srcAddr := h.SourceAddress()
-	dstAddr := h.DestinationAddress()
-
-	addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint)
-	if addressEndpoint == nil {
-		if !e.protocol.Forwarding() {
-			stats.IP.InvalidDestinationAddressesReceived.Increment()
-			return
-		}
-
-		_ = e.forwardPacket(pkt)
-		return
-	}
-	subnet := addressEndpoint.AddressWithPrefix().Subnet()
-	addressEndpoint.DecRef()
 
 	// There has been some confusion regarding verifying checksums. We need
 	// just look for negative 0 (0xffff) as the checksum, as it's not possible to
@@ -608,16 +607,42 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 		return
 	}
 
+	srcAddr := h.SourceAddress()
+	dstAddr := h.DestinationAddress()
+
 	// As per RFC 1122 section 3.2.1.3:
 	//   When a host sends any datagram, the IP source address MUST
 	//   be one of its own IP addresses (but not a broadcast or
 	//   multicast address).
-	if directedBroadcast := subnet.IsBroadcast(srcAddr); directedBroadcast || srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
+	if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
 		stats.IP.InvalidSourceAddressesReceived.Increment()
 		return
 	}
+	// Make sure the source address is not a subnet-local broadcast address.
+	if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
+		subnet := addressEndpoint.Subnet()
+		addressEndpoint.DecRef()
+		if subnet.IsBroadcast(srcAddr) {
+			stats.IP.InvalidSourceAddressesReceived.Increment()
+			return
+		}
+	}
 
-	pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
+	// The destination address should be an address we own or a group we joined
+	// for us to receive the packet. Otherwise, attempt to forward the packet.
+	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil {
+		subnet := addressEndpoint.AddressWithPrefix().Subnet()
+		addressEndpoint.DecRef()
+		pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
+	} else if !e.IsInGroup(dstAddr) {
+		if !e.protocol.Forwarding() {
+			stats.IP.InvalidDestinationAddressesReceived.Increment()
+			return
+		}
+
+		_ = e.forwardPacket(pkt)
+		return
+	}
 
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
@@ -692,6 +717,10 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 		e.handleICMP(pkt)
 		return
 	}
+	if p == header.IGMPProtocolNumber {
+		e.igmp.handleIGMP(pkt)
+		return
+	}
 	if opts := h.Options(); len(opts) != 0 {
 		// TODO(gvisor.dev/issue/4586):
 		// When we add forwarding support we should use the verified options
@@ -770,28 +799,12 @@ func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp boo
 	defer e.mu.Unlock()
 
 	loopback := e.nic.IsLoopback()
-	addressEndpoint := e.mu.addressableEndpointState.ReadOnly().AddrOrMatching(localAddr, allowTemp, func(addressEndpoint stack.AddressEndpoint) bool {
+	return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool {
 		subnet := addressEndpoint.Subnet()
 		// IPv4 has a notion of a subnet broadcast address and considers the
 		// loopback interface bound to an address's whole subnet (on linux).
 		return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr))
-	})
-	if addressEndpoint != nil {
-		return addressEndpoint
-	}
-
-	if !allowTemp {
-		return nil
-	}
-
-	addr := localAddr.WithPrefix()
-	addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(addr, tempPEB)
-	if err != nil {
-		// AddAddress only returns an error if the address is already assigned,
-		// but we just checked above if the address exists so we expect no error.
-		panic(fmt.Sprintf("e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(%s, %d): %s", addr, tempPEB, err))
-	}
-	return addressEndpoint
+	}, allowTemp, tempPEB)
 }
 
 // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
@@ -816,28 +829,43 @@ func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
 }
 
 // JoinGroup implements stack.GroupAddressableEndpoint.
-func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+func (e *endpoint) JoinGroup(addr tcpip.Address) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.joinGroupLocked(addr)
+}
+
+// joinGroupLocked is like JoinGroup but with locking requirements.
+//
+// Precondition: e.mu must be locked.
+func (e *endpoint) joinGroupLocked(addr tcpip.Address) *tcpip.Error {
 	if !header.IsV4MulticastAddress(addr) {
-		return false, tcpip.ErrBadAddress
+		return tcpip.ErrBadAddress
 	}
 
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return e.mu.addressableEndpointState.JoinGroup(addr)
+	e.igmp.joinGroup(addr)
+	return nil
 }
 
 // LeaveGroup implements stack.GroupAddressableEndpoint.
-func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+func (e *endpoint) LeaveGroup(addr tcpip.Address) *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
-	return e.mu.addressableEndpointState.LeaveGroup(addr)
+	return e.leaveGroupLocked(addr)
+}
+
+// leaveGroupLocked is like LeaveGroup but with locking requirements.
+//
+// Precondition: e.mu must be locked.
+func (e *endpoint) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
+	return e.igmp.leaveGroup(addr)
 }
 
 // IsInGroup implements stack.GroupAddressableEndpoint.
 func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
-	return e.mu.addressableEndpointState.IsInGroup(addr)
+	return e.igmp.isInGroup(addr)
 }
 
 var _ stack.ForwardingNetworkProtocol = (*protocol)(nil)
@@ -863,6 +891,8 @@ type protocol struct {
 	hashIV uint32
 
 	fragmentation *fragmentation.Fragmentation
+
+	options Options
 }
 
 // Number returns the ipv4 protocol number.
@@ -987,17 +1017,23 @@ func addressToUint32(addr tcpip.Address) uint32 {
 	return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24
 }
 
-// hashRoute calculates a hash value for the given route. It uses the source &
-// destination address, the transport protocol number and a 32-bit number to
-// generate the hash.
-func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
-	a := addressToUint32(r.LocalAddress)
-	b := addressToUint32(r.RemoteAddress)
+// hashRoute calculates a hash value for the given source/destination pair using
+// the addresses, transport protocol number and a 32-bit number to generate the
+// hash.
+func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
+	a := addressToUint32(srcAddr)
+	b := addressToUint32(dstAddr)
 	return hash.Hash3Words(a, b, uint32(protocol), hashIV)
 }
 
-// NewProtocol returns an IPv4 network protocol.
-func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
+// Options holds options to configure a new protocol.
+type Options struct {
+	// IGMP holds options for IGMP.
+	IGMP IGMPOptions
+}
+
+// NewProtocolWithOptions returns an IPv4 network protocol.
+func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
 	ids := make([]uint32, buckets)
 
 	// Randomly initialize hashIV and the ids.
@@ -1007,14 +1043,22 @@ func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
 	}
 	hashIV := r[buckets]
 
-	p := &protocol{
-		stack:      s,
-		ids:        ids,
-		hashIV:     hashIV,
-		defaultTTL: DefaultTTL,
+	return func(s *stack.Stack) stack.NetworkProtocol {
+		p := &protocol{
+			stack:      s,
+			ids:        ids,
+			hashIV:     hashIV,
+			defaultTTL: DefaultTTL,
+			options:    opts,
+		}
+		p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p)
+		return p
 	}
-	p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p)
-	return p
+}
+
+// NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
+func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
+	return NewProtocolWithOptions(Options{})(s)
 }
 
 func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) {
@@ -1129,6 +1173,12 @@ func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Addres
 	}
 
 	pointer := tsOpt.Pointer()
+	// RFC 791 page 22 states: "The smallest legal value is 5."
+	// Since the pointer is 1 based, and the header is 4 bytes long the
+	// pointer must point beyond the header therefore 4 or less is bad.
+	if pointer <= header.IPv4OptionTimestampHdrLength {
+		return header.IPv4OptTSPointerOffset, errIPv4TimestampOptInvalidPointer
+	}
 	// To simplify processing below, base further work on the array of timestamps
 	// beyond the header, rather than on the whole option. Also to aid
 	// calculations set 'nextSlot' to be 0 based as in the packet it is 1 based.
@@ -1215,7 +1265,15 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad
 		return header.IPv4OptionLengthOffset, errIPv4RecordRouteOptInvalidLength
 	}
 
-	nextSlot := rrOpt.Pointer() - 1 // Pointer is 1 based.
+	pointer := rrOpt.Pointer()
+	// RFC 791 page 20 states:
+	//      The pointer is relative to this option, and the
+	//      smallest legal value for the pointer is 4.
+	// Since the pointer is 1 based, and the header is 3 bytes long the
+	// pointer must point beyond the header therefore 3 or less is bad.
+	if pointer <= header.IPv4OptionRecordRouteHdrLength {
+		return header.IPv4OptRRPointerOffset, errIPv4RecordRouteOptInvalidPointer
+	}
 
 	// RFC 791 page 21 says
 	//       If the route data area is already full (the pointer exceeds the
@@ -1230,14 +1288,14 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad
 	// do this (as do most implementations). It is probable that the inclusion
 	// of these words is a copy/paste error from the timestamp option where
 	// there are two failure reasons given.
-	if nextSlot >= optlen {
+	if pointer > optlen {
 		return 0, nil
 	}
 
 	// The data area isn't full but there isn't room for a new entry.
 	// Either Length or Pointer could be bad. We must select Pointer for Linux
-	// compatibility, even if only the length is bad.
-	if nextSlot+header.IPv4AddressSize > optlen {
+	// compatibility, even if only the length is bad. NB. pointer is 1 based.
+	if pointer+header.IPv4AddressSize > optlen+1 {
 		if false {
 			// This is what we would do if we were not being Linux compatible.
 			// Check for bad pointer or length value. Must be a multiple of 4 after
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 4e4e1f3b4..9e2d2cfd6 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -103,105 +103,6 @@ func TestExcludeBroadcast(t *testing.T) {
 	})
 }
 
-// TestIPv4Encode checks that ipv4.Encode correctly fills out the requested
-// fields when options are supplied.
-func TestIPv4EncodeOptions(t *testing.T) {
-	tests := []struct {
-		name           string
-		options        header.IPv4Options
-		encodedOptions header.IPv4Options // reply should look like this
-		wantIHL        int
-	}{
-		{
-			name:    "valid no options",
-			wantIHL: header.IPv4MinimumSize,
-		},
-		{
-			name:           "one byte options",
-			options:        header.IPv4Options{1},
-			encodedOptions: header.IPv4Options{1, 0, 0, 0},
-			wantIHL:        header.IPv4MinimumSize + 4,
-		},
-		{
-			name:           "two byte options",
-			options:        header.IPv4Options{1, 1},
-			encodedOptions: header.IPv4Options{1, 1, 0, 0},
-			wantIHL:        header.IPv4MinimumSize + 4,
-		},
-		{
-			name:           "three byte options",
-			options:        header.IPv4Options{1, 1, 1},
-			encodedOptions: header.IPv4Options{1, 1, 1, 0},
-			wantIHL:        header.IPv4MinimumSize + 4,
-		},
-		{
-			name:           "four byte options",
-			options:        header.IPv4Options{1, 1, 1, 1},
-			encodedOptions: header.IPv4Options{1, 1, 1, 1},
-			wantIHL:        header.IPv4MinimumSize + 4,
-		},
-		{
-			name:           "five byte options",
-			options:        header.IPv4Options{1, 1, 1, 1, 1},
-			encodedOptions: header.IPv4Options{1, 1, 1, 1, 1, 0, 0, 0},
-			wantIHL:        header.IPv4MinimumSize + 8,
-		},
-		{
-			name: "thirty nine byte options",
-			options: header.IPv4Options{
-				1, 2, 3, 4, 5, 6, 7, 8,
-				9, 10, 11, 12, 13, 14, 15, 16,
-				17, 18, 19, 20, 21, 22, 23, 24,
-				25, 26, 27, 28, 29, 30, 31, 32,
-				33, 34, 35, 36, 37, 38, 39,
-			},
-			encodedOptions: header.IPv4Options{
-				1, 2, 3, 4, 5, 6, 7, 8,
-				9, 10, 11, 12, 13, 14, 15, 16,
-				17, 18, 19, 20, 21, 22, 23, 24,
-				25, 26, 27, 28, 29, 30, 31, 32,
-				33, 34, 35, 36, 37, 38, 39, 0,
-			},
-			wantIHL: header.IPv4MinimumSize + 40,
-		},
-	}
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			paddedOptionLength := test.options.SizeWithPadding()
-			ipHeaderLength := header.IPv4MinimumSize + paddedOptionLength
-			if ipHeaderLength > header.IPv4MaximumHeaderSize {
-				t.Fatalf("IP header length too large: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize)
-			}
-			totalLen := uint16(ipHeaderLength)
-			hdr := buffer.NewPrependable(int(totalLen))
-			ip := header.IPv4(hdr.Prepend(ipHeaderLength))
-			// To check the padding works, poison the last byte of the options space.
-			if paddedOptionLength != len(test.options) {
-				ip.SetHeaderLength(uint8(ipHeaderLength))
-				ip.Options()[paddedOptionLength-1] = 0xff
-				ip.SetHeaderLength(0)
-			}
-			ip.Encode(&header.IPv4Fields{
-				Options: test.options,
-			})
-			options := ip.Options()
-			wantOptions := test.encodedOptions
-			if got, want := int(ip.HeaderLength()), test.wantIHL; got != want {
-				t.Errorf("got IHL of %d, want %d", got, want)
-			}
-
-			// cmp.Diff does not consider nil slices equal to empty slices, but we do.
-			if len(wantOptions) == 0 && len(options) == 0 {
-				return
-			}
-
-			if diff := cmp.Diff(wantOptions, options); diff != "" {
-				t.Errorf("options mismatch (-want +got):\n%s", diff)
-			}
-		})
-	}
-}
-
 func TestForwarding(t *testing.T) {
 	const (
 		nicID1         = 1
@@ -453,14 +354,6 @@ func TestIPv4Sanity(t *testing.T) {
 			replyOptions:      header.IPv4Options{1, 1, 0, 0},
 		},
 		{
-			name:              "Check option padding",
-			maxTotalLength:    ipv4.MaxTotalSize,
-			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
-			TTL:               ttl,
-			options:           header.IPv4Options{1, 1, 1},
-			replyOptions:      header.IPv4Options{1, 1, 1, 0},
-		},
-		{
 			name:              "bad header length",
 			headerLength:      header.IPv4MinimumSize - 1,
 			maxTotalLength:    ipv4.MaxTotalSize,
@@ -583,7 +476,7 @@ func TestIPv4Sanity(t *testing.T) {
 				68, 7, 5, 0,
 				//  ^  ^ Linux points here which is wrong.
 				//  | Not a multiple of 4
-				1, 2, 3,
+				1, 2, 3, 0,
 			},
 			shouldFail:          true,
 			expectErrorICMP:     true,
@@ -662,6 +555,56 @@ func TestIPv4Sanity(t *testing.T) {
 			},
 		},
 		{
+			// Timestamp pointer uses one based counting so 0 is invalid.
+			name:              "timestamp pointer invalid",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options: header.IPv4Options{
+				68, 8, 0, 0x00,
+				//      ^ 0 instead of 5 or more.
+				0, 0, 0, 0,
+			},
+			shouldFail:          true,
+			expectErrorICMP:     true,
+			ICMPType:            header.ICMPv4ParamProblem,
+			ICMPCode:            header.ICMPv4UnusedCode,
+			paramProblemPointer: header.IPv4MinimumSize + 2,
+		},
+		{
+			// Timestamp pointer cannot be less than 5. It must point past the header
+			// which is 4 bytes. (1 based counting)
+			name:              "timestamp pointer too small by 1",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options: header.IPv4Options{
+				68, 8, header.IPv4OptionTimestampHdrLength, 0x00,
+				//          ^ header is 4 bytes, so 4 should fail.
+				0, 0, 0, 0,
+			},
+			shouldFail:          true,
+			expectErrorICMP:     true,
+			ICMPType:            header.ICMPv4ParamProblem,
+			ICMPCode:            header.ICMPv4UnusedCode,
+			paramProblemPointer: header.IPv4MinimumSize + 2,
+		},
+		{
+			name:              "valid timestamp pointer",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options: header.IPv4Options{
+				68, 8, header.IPv4OptionTimestampHdrLength + 1, 0x00,
+				//          ^ header is 4 bytes, so 5 should succeed.
+				0, 0, 0, 0,
+			},
+			replyOptions: header.IPv4Options{
+				68, 8, 9, 0x00,
+				0x00, 0xad, 0x1c, 0x40, // time we expect from fakeclock
+			},
+		},
+		{
 			// Needs 8 bytes for a type 1 timestamp but there are only 4 free.
 			name:              "bad timer element alignment",
 			maxTotalLength:    ipv4.MaxTotalSize,
@@ -792,7 +735,61 @@ func TestIPv4Sanity(t *testing.T) {
 			},
 		},
 		{
-			// Confirm linux bug for bug compatibility.
+			// Pointer uses one based counting so 0 is invalid.
+			name:              "record route pointer zero",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options: header.IPv4Options{
+				7, 8, 0, // 3 byte header
+				0, 0, 0, 0,
+				0,
+			},
+			shouldFail:          true,
+			expectErrorICMP:     true,
+			ICMPType:            header.ICMPv4ParamProblem,
+			ICMPCode:            header.ICMPv4UnusedCode,
+			paramProblemPointer: header.IPv4MinimumSize + 2,
+		},
+		{
+			// Pointer must be 4 or more as it must point past the 3 byte header
+			// using 1 based counting. 3 should fail.
+			name:              "record route pointer too small by 1",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options: header.IPv4Options{
+				7, 8, header.IPv4OptionRecordRouteHdrLength, // 3 byte header
+				0, 0, 0, 0,
+				0,
+			},
+			shouldFail:          true,
+			expectErrorICMP:     true,
+			ICMPType:            header.ICMPv4ParamProblem,
+			ICMPCode:            header.ICMPv4UnusedCode,
+			paramProblemPointer: header.IPv4MinimumSize + 2,
+		},
+		{
+			// Pointer must be 4 or more as it must point past the 3 byte header
+			// using 1 based counting. Check 4 passes. (Duplicates "single
+			// record route with room")
+			name:              "valid record route pointer",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options: header.IPv4Options{
+				7, 7, header.IPv4OptionRecordRouteHdrLength + 1, // 3 byte header
+				0, 0, 0, 0,
+				0,
+			},
+			replyOptions: header.IPv4Options{
+				7, 7, 8, // 3 byte header
+				192, 168, 1, 58, // New IP Address.
+				0, // padding to multiple of 4 bytes.
+			},
+		},
+		{
+			// Confirm Linux bug for bug compatibility.
 			// Linux returns slot 22 but the error is in slot 21.
 			name:              "multiple record route with not enough room",
 			maxTotalLength:    ipv4.MaxTotalSize,
@@ -863,8 +860,10 @@ func TestIPv4Sanity(t *testing.T) {
 				},
 			})
 
-			paddedOptionLength := test.options.SizeWithPadding()
-			ipHeaderLength := header.IPv4MinimumSize + paddedOptionLength
+			if len(test.options)%4 != 0 {
+				t.Fatalf("options must be aligned to 32 bits, invalid test options: %x (len=%d)", test.options, len(test.options))
+			}
+			ipHeaderLength := header.IPv4MinimumSize + len(test.options)
 			if ipHeaderLength > header.IPv4MaximumHeaderSize {
 				t.Fatalf("IP header length too large: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize)
 			}
@@ -883,11 +882,6 @@ func TestIPv4Sanity(t *testing.T) {
 			if test.maxTotalLength < totalLen {
 				totalLen = test.maxTotalLength
 			}
-			// To check the padding works, poison the options space.
-			if paddedOptionLength != len(test.options) {
-				ip.SetHeaderLength(uint8(ipHeaderLength))
-				ip.Options()[paddedOptionLength-1] = 0x01
-			}
 
 			ip.Encode(&header.IPv4Fields{
 				TotalLength: totalLen,
@@ -895,10 +889,19 @@ func TestIPv4Sanity(t *testing.T) {
 				TTL:         test.TTL,
 				SrcAddr:     remoteIPv4Addr,
 				DstAddr:     ipv4Addr.Address,
-				Options:     test.options,
 			})
 			if test.headerLength != 0 {
 				ip.SetHeaderLength(test.headerLength)
+			} else {
+				// Set the calculated header length, since we may manually add options.
+				ip.SetHeaderLength(uint8(ipHeaderLength))
+			}
+			if len(test.options) != 0 {
+				// Copy options manually. We do not use Encode for options so we can
+				// verify malformed options with handcrafted payloads.
+				if want, got := copy(ip.Options(), test.options), len(test.options); want != got {
+					t.Fatalf("got copy(ip.Options(), test.options) = %d, want = %d", got, want)
+				}
 			}
 			ip.SetChecksum(0)
 			ipHeaderChecksum := ip.CalculateChecksum()
@@ -1003,7 +1006,7 @@ func TestIPv4Sanity(t *testing.T) {
 				}
 				// If the IP options change size then the packet will change size, so
 				// some IP header fields will need to be adjusted for the checks.
-				sizeChange := len(test.replyOptions) - paddedOptionLength
+				sizeChange := len(test.replyOptions) - len(test.options)
 
 				checker.IPv4(t, replyIPHeader,
 					checker.IPv4HeaderLength(ipHeaderLength+sizeChange),
@@ -2320,6 +2323,28 @@ func TestReceiveFragments(t *testing.T) {
 			},
 			expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2},
 		},
+		{
+			name: "Two fragments with MF flag reassembled into a maximum UDP packet",
+			fragments: []fragmentData{
+				{
+					srcAddr:        addr1,
+					dstAddr:        addr2,
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload4Addr1ToAddr2[:65512],
+				},
+				{
+					srcAddr:        addr1,
+					dstAddr:        addr2,
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 65512,
+					payload:        ipv4Payload4Addr1ToAddr2[65512:],
+				},
+			},
+			expectedPayloads: nil,
+		},
 	}
 
 	for _, test := range tests {
@@ -2513,7 +2538,7 @@ func TestWriteStats(t *testing.T) {
 
 					test.setup(t, rt.Stack())
 
-					nWritten, _ := writer.writePackets(&rt, pkts)
+					nWritten, _ := writer.writePackets(rt, pkts)
 
 					if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent {
 						t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent)
@@ -2530,7 +2555,7 @@ func TestWriteStats(t *testing.T) {
 	}
 }
 
-func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route {
+func buildRoute(t *testing.T, ep stack.LinkEndpoint) *stack.Route {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
 	})
@@ -2644,8 +2669,8 @@ func TestPacketQueing(t *testing.T) {
 				if p.Proto != header.IPv4ProtocolNumber {
 					t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber)
 				}
-				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
-					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr)
 				}
 				checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
 					checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address),
@@ -2687,8 +2712,8 @@ func TestPacketQueing(t *testing.T) {
 				if p.Proto != header.IPv4ProtocolNumber {
 					t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber)
 				}
-				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
-					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr)
 				}
 				checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
 					checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address),
@@ -2736,8 +2761,8 @@ func TestPacketQueing(t *testing.T) {
 				if p.Proto != arp.ProtocolNumber {
 					t.Errorf("got p.Proto = %d, want = %d", p.Proto, arp.ProtocolNumber)
 				}
-				if p.Route.RemoteLinkAddress != header.EthernetBroadcastAddress {
-					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, header.EthernetBroadcastAddress)
+				if got := p.Route.RemoteLinkAddress(); got != header.EthernetBroadcastAddress {
+					t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, header.EthernetBroadcastAddress)
 				}
 				rep := header.ARP(p.Pkt.NetworkHeader().View())
 				if got := rep.Op(); got != header.ARPRequest {
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index 0ac24a6fb..5e75c8740 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -8,6 +8,7 @@ go_library(
         "dhcpv6configurationfromndpra_string.go",
         "icmp.go",
         "ipv6.go",
+        "mld.go",
         "ndp.go",
     ],
     visibility = ["//visibility:public"],
@@ -19,6 +20,7 @@ go_library(
         "//pkg/tcpip/header/parse",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
+        "//pkg/tcpip/network/ip",
         "//pkg/tcpip/stack",
     ],
 )
@@ -49,3 +51,16 @@ go_test(
         "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
+
+go_test(
+    name = "ipv6_x_test",
+    size = "small",
+    srcs = ["mld_test.go"],
+    deps = [
+        ":ipv6",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index beb8f562e..510276b8e 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -126,8 +126,8 @@ func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) {
 
 func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := e.protocol.stack.Stats().ICMP
-	sent := stats.V6PacketsSent
-	received := stats.V6PacketsReceived
+	sent := stats.V6.PacketsSent
+	received := stats.V6.PacketsReceived
 	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
 	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
 	// full explanation.
@@ -163,7 +163,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 	}
 
 	// TODO(b/112892170): Meaningfully handle all ICMP types.
-	switch h.Type() {
+	switch icmpType := h.Type(); icmpType {
 	case header.ICMPv6PacketTooBig:
 		received.PacketTooBig.Increment()
 		hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize)
@@ -358,7 +358,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 		pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
 		packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize))
 		packet.SetType(header.ICMPv6NeighborAdvert)
-		na := header.NDPNeighborAdvert(packet.NDPPayload())
+		na := header.NDPNeighborAdvert(packet.MessageBody())
 
 		// As per RFC 4861 section 7.2.4:
 		//
@@ -644,8 +644,31 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 			return
 		}
 
+	case header.ICMPv6MulticastListenerQuery, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerDone:
+		var handler func(header.MLD)
+		switch icmpType {
+		case header.ICMPv6MulticastListenerQuery:
+			received.MulticastListenerQuery.Increment()
+			handler = e.mld.handleMulticastListenerQuery
+		case header.ICMPv6MulticastListenerReport:
+			received.MulticastListenerReport.Increment()
+			handler = e.mld.handleMulticastListenerReport
+		case header.ICMPv6MulticastListenerDone:
+			received.MulticastListenerDone.Increment()
+		default:
+			panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType))
+		}
+		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.MLDMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+
+		if handler != nil {
+			handler(header.MLD(payload.ToView()))
+		}
+
 	default:
-		received.Invalid.Increment()
+		received.Unrecognized.Increment()
 	}
 }
 
@@ -681,12 +704,12 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot
 	pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
 	packet := header.ICMPv6(pkt.TransportHeader().Push(neighborSolicitSize))
 	packet.SetType(header.ICMPv6NeighborSolicit)
-	ns := header.NDPNeighborSolicit(packet.NDPPayload())
+	ns := header.NDPNeighborSolicit(packet.MessageBody())
 	ns.SetTargetAddress(targetAddr)
 	ns.Options().Serialize(optsSerializer)
 	packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
-	stat := p.stack.Stats().ICMP.V6PacketsSent
+	stat := p.stack.Stats().ICMP.V6.PacketsSent
 	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
 		Protocol: header.ICMPv6ProtocolNumber,
 		TTL:      header.NDPHopLimit,
@@ -796,7 +819,8 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi
 		allowResponseToMulticast = reason.respondToMulticast
 	}
 
-	if (!allowResponseToMulticast && header.IsV6MulticastAddress(origIPHdrDst)) || origIPHdrSrc == header.IPv6Any {
+	isOrigDstMulticast := header.IsV6MulticastAddress(origIPHdrDst)
+	if (!allowResponseToMulticast && isOrigDstMulticast) || origIPHdrSrc == header.IPv6Any {
 		return nil
 	}
 
@@ -812,8 +836,13 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi
 	// If we are operating as a router, do not use the packet's destination
 	// address as the response's source address as we should not own the
 	// destination address of a packet we are forwarding.
+	//
+	// If the packet was originally destined to a multicast address, then do not
+	// use the packet's destination address as the source for the response ICMP
+	// packet as "multicast addresses must not be used as source addresses in IPv6
+	// packets", as per RFC 4291 section 2.7.
 	localAddr := origIPHdrDst
-	if _, ok := reason.(*icmpReasonHopLimitExceeded); ok {
+	if _, ok := reason.(*icmpReasonHopLimitExceeded); ok || isOrigDstMulticast {
 		localAddr = ""
 	}
 	// Even if we were able to receive a packet from some remote, we may not have
@@ -827,7 +856,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi
 	defer route.Release()
 
 	stats := p.stack.Stats().ICMP
-	sent := stats.V6PacketsSent
+	sent := stats.V6.PacketsSent
 	if !p.stack.AllowICMPMessage() {
 		sent.RateLimited.Increment()
 		return nil
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 9bc02d851..32adb5c83 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -150,9 +150,9 @@ func (*testInterface) Promiscuous() bool {
 
 func (t *testInterface) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	r := stack.Route{
-		NetProto:          protocol,
-		RemoteLinkAddress: remoteLinkAddr,
+		NetProto: protocol,
 	}
+	r.ResolveWith(remoteLinkAddr)
 	return t.LinkEndpoint.WritePacket(&r, gso, protocol, pkt)
 }
 
@@ -271,6 +271,22 @@ func TestICMPCounts(t *testing.T) {
 					typ:  header.ICMPv6RedirectMsg,
 					size: header.ICMPv6MinimumSize,
 				},
+				{
+					typ:  header.ICMPv6MulticastListenerQuery,
+					size: header.MLDMinimumSize + header.ICMPv6HeaderSize,
+				},
+				{
+					typ:  header.ICMPv6MulticastListenerReport,
+					size: header.MLDMinimumSize + header.ICMPv6HeaderSize,
+				},
+				{
+					typ:  header.ICMPv6MulticastListenerDone,
+					size: header.MLDMinimumSize + header.ICMPv6HeaderSize,
+				},
+				{
+					typ:  255, /* Unrecognized */
+					size: 50,
+				},
 			}
 
 			handleIPv6Payload := func(icmp header.ICMPv6) {
@@ -301,7 +317,7 @@ func TestICMPCounts(t *testing.T) {
 			// Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented.
 			handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
 
-			icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
+			icmpv6Stats := s.Stats().ICMP.V6.PacketsReceived
 			visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
 				if got, want := s.Value(), uint64(1); got != want {
 					t.Errorf("got %s = %d, want = %d", name, got, want)
@@ -413,6 +429,22 @@ func TestICMPCountsWithNeighborCache(t *testing.T) {
 			typ:  header.ICMPv6RedirectMsg,
 			size: header.ICMPv6MinimumSize,
 		},
+		{
+			typ:  header.ICMPv6MulticastListenerQuery,
+			size: header.MLDMinimumSize + header.ICMPv6HeaderSize,
+		},
+		{
+			typ:  header.ICMPv6MulticastListenerReport,
+			size: header.MLDMinimumSize + header.ICMPv6HeaderSize,
+		},
+		{
+			typ:  header.ICMPv6MulticastListenerDone,
+			size: header.MLDMinimumSize + header.ICMPv6HeaderSize,
+		},
+		{
+			typ:  255, /* Unrecognized */
+			size: 50,
+		},
 	}
 
 	handleIPv6Payload := func(icmp header.ICMPv6) {
@@ -443,7 +475,7 @@ func TestICMPCountsWithNeighborCache(t *testing.T) {
 	// Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented.
 	handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
 
-	icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
+	icmpv6Stats := s.Stats().ICMP.V6.PacketsReceived
 	visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
 		if got, want := s.Value(), uint64(1); got != want {
 			t.Errorf("got %s = %d, want = %d", name, got, want)
@@ -568,8 +600,8 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		return
 	}
 
-	if len(args.remoteLinkAddr) != 0 && args.remoteLinkAddr != pi.Route.RemoteLinkAddress {
-		t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr)
+	if got := pi.Route.RemoteLinkAddress(); len(args.remoteLinkAddr) != 0 && got != args.remoteLinkAddr {
+		t.Errorf("got remote link address = %s, want = %s", got, args.remoteLinkAddr)
 	}
 
 	// Pull the full payload since network header. Needed for header.IPv6 to
@@ -833,7 +865,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 							e.InjectInbound(ProtocolNumber, pkt)
 						}
 
-						stats := s.Stats().ICMP.V6PacketsReceived
+						stats := s.Stats().ICMP.V6.PacketsReceived
 						invalid := stats.Invalid
 						routerOnly := stats.RouterOnlyPacketsDroppedByHost
 						typStat := typ.statCounter(stats)
@@ -1028,7 +1060,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 				e.InjectInbound(ProtocolNumber, pkt)
 			}
 
-			stats := s.Stats().ICMP.V6PacketsReceived
+			stats := s.Stats().ICMP.V6.PacketsReceived
 			invalid := stats.Invalid
 			typStat := typ.statCounter(stats)
 
@@ -1207,7 +1239,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 				e.InjectInbound(ProtocolNumber, pkt)
 			}
 
-			stats := s.Stats().ICMP.V6PacketsReceived
+			stats := s.Stats().ICMP.V6.PacketsReceived
 			invalid := stats.Invalid
 			typStat := typ.statCounter(stats)
 
@@ -1349,8 +1381,8 @@ func TestLinkAddressRequest(t *testing.T) {
 		if !ok {
 			t.Fatal("expected to send a link address request")
 		}
-		if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr {
-			t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr)
+		if got := pkt.Route.RemoteLinkAddress(); got != test.expectedRemoteLinkAddr {
+			t.Errorf("got pkt.Route.RemoteLinkAddress() = %s, want = %s", got, test.expectedRemoteLinkAddr)
 		}
 		if pkt.Route.RemoteAddress != test.expectedRemoteAddr {
 			t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, test.expectedRemoteAddr)
@@ -1431,8 +1463,8 @@ func TestPacketQueing(t *testing.T) {
 				if p.Proto != ProtocolNumber {
 					t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber)
 				}
-				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
-					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr)
 				}
 				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
 					checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address),
@@ -1473,8 +1505,8 @@ func TestPacketQueing(t *testing.T) {
 				if p.Proto != ProtocolNumber {
 					t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber)
 				}
-				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
-					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr)
 				}
 				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
 					checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address),
@@ -1524,8 +1556,8 @@ func TestPacketQueing(t *testing.T) {
 					t.Errorf("got Proto = %d, want = %d", p.Proto, ProtocolNumber)
 				}
 				snmc := header.SolicitedNodeAddr(host2IPv6Addr.AddressWithPrefix.Address)
-				if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want {
-					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want)
+				if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(snmc); got != want {
+					t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, want)
 				}
 				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
 					checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address),
@@ -1543,7 +1575,7 @@ func TestPacketQueing(t *testing.T) {
 				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
 				pkt := header.ICMPv6(hdr.Prepend(naSize))
 				pkt.SetType(header.ICMPv6NeighborAdvert)
-				na := header.NDPNeighborAdvert(pkt.NDPPayload())
+				na := header.NDPNeighborAdvert(pkt.MessageBody())
 				na.SetSolicitedFlag(true)
 				na.SetOverrideFlag(true)
 				na.SetTargetAddress(host2IPv6Addr.AddressWithPrefix.Address)
@@ -1592,7 +1624,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(nsSize))
 				icmp.SetType(header.ICMPv6NeighborSolicit)
-				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns := header.NDPNeighborSolicit(icmp.MessageBody())
 				ns.SetTargetAddress(lladdr0)
 				return icmp
 			},
@@ -1612,7 +1644,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(nsSize))
 				icmp.SetType(header.ICMPv6NeighborSolicit)
-				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns := header.NDPNeighborSolicit(icmp.MessageBody())
 				ns.SetTargetAddress(lladdr0)
 				ns.Options().Serialize(header.NDPOptionsSerializer{
 					header.NDPSourceLinkLayerAddressOption(linkAddr1),
@@ -1629,7 +1661,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(nsSize))
 				icmp.SetType(header.ICMPv6NeighborSolicit)
-				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns := header.NDPNeighborSolicit(icmp.MessageBody())
 				ns.SetTargetAddress(lladdr0)
 				return icmp
 			},
@@ -1645,7 +1677,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(nsSize))
 				icmp.SetType(header.ICMPv6NeighborSolicit)
-				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns := header.NDPNeighborSolicit(icmp.MessageBody())
 				ns.SetTargetAddress(lladdr0)
 				ns.Options().Serialize(header.NDPOptionsSerializer{
 					header.NDPSourceLinkLayerAddressOption(linkAddr1),
@@ -1662,7 +1694,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				naSize := header.ICMPv6NeighborAdvertMinimumSize
 				icmp := header.ICMPv6(buffer.NewView(naSize))
 				icmp.SetType(header.ICMPv6NeighborAdvert)
-				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na := header.NDPNeighborAdvert(icmp.MessageBody())
 				na.SetSolicitedFlag(true)
 				na.SetOverrideFlag(false)
 				na.SetTargetAddress(lladdr1)
@@ -1683,7 +1715,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(naSize))
 				icmp.SetType(header.ICMPv6NeighborAdvert)
-				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na := header.NDPNeighborAdvert(icmp.MessageBody())
 				na.SetSolicitedFlag(true)
 				na.SetOverrideFlag(false)
 				na.SetTargetAddress(lladdr1)
@@ -1702,7 +1734,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(naSize))
 				icmp.SetType(header.ICMPv6NeighborAdvert)
-				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na := header.NDPNeighborAdvert(icmp.MessageBody())
 				na.SetSolicitedFlag(false)
 				na.SetOverrideFlag(false)
 				na.SetTargetAddress(lladdr1)
@@ -1722,7 +1754,7 @@ func TestCallsToNeighborCache(t *testing.T) {
 				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
 				icmp := header.ICMPv6(buffer.NewView(naSize))
 				icmp.SetType(header.ICMPv6NeighborAdvert)
-				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na := header.NDPNeighborAdvert(icmp.MessageBody())
 				na.SetSolicitedFlag(false)
 				na.SetOverrideFlag(false)
 				na.SetTargetAddress(lladdr1)
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 7a00f6314..8bf84601f 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -34,7 +34,9 @@ import (
 )
 
 const (
+	// ReassembleTimeout controls how long a fragment will be held.
 	// As per RFC 8200 section 4.5:
+	//
 	//   If insufficient fragments are received to complete reassembly of a packet
 	//   within 60 seconds of the reception of the first-arriving fragment of that
 	//   packet, reassembly of that packet must be abandoned.
@@ -84,6 +86,8 @@ type endpoint struct {
 		addressableEndpointState stack.AddressableEndpointState
 		ndp                      ndpState
 	}
+
+	mld mldState
 }
 
 // NICNameFromID is a function that returns a stable name for the specified NIC,
@@ -224,6 +228,12 @@ func (e *endpoint) Enable() *tcpip.Error {
 		return nil
 	}
 
+	// Groups may have been joined when the endpoint was disabled, or the
+	// endpoint may have left groups from the perspective of MLD when the
+	// endpoint was disabled. Either way, we need to let routers know to
+	// send us multicast traffic.
+	e.mld.initializeAll()
+
 	// Join the IPv6 All-Nodes Multicast group if the stack is configured to
 	// use IPv6. This is required to ensure that this node properly receives
 	// and responds to the various NDP messages that are destined to the
@@ -241,8 +251,10 @@ func (e *endpoint) Enable() *tcpip.Error {
 	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
 	// source address of the NDP NS is the unspecified address, as per RFC 4861
 	// section 7.2.4.
-	if _, err := e.mu.addressableEndpointState.JoinGroup(header.IPv6AllNodesMulticastAddress); err != nil {
-		return err
+	if err := e.joinGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil {
+		// joinGroupLocked only returns an error if the group address is not a valid
+		// IPv6 multicast address.
+		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv6AllNodesMulticastAddress, err))
 	}
 
 	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
@@ -251,7 +263,7 @@ func (e *endpoint) Enable() *tcpip.Error {
 	// Addresses may have aleady completed DAD but in the time since the endpoint
 	// was last enabled, other devices may have acquired the same addresses.
 	var err *tcpip.Error
-	e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool {
+	e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool {
 		addr := addressEndpoint.AddressWithPrefix().Address
 		if !header.IsV6UnicastAddress(addr) {
 			return true
@@ -273,7 +285,7 @@ func (e *endpoint) Enable() *tcpip.Error {
 	}
 
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
-	if e.protocol.autoGenIPv6LinkLocal && !e.nic.IsLoopback() {
+	if e.protocol.options.AutoGenLinkLocal && !e.nic.IsLoopback() {
 		// The valid and preferred lifetime is infinite for the auto-generated
 		// link-local address.
 		e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
@@ -331,9 +343,13 @@ func (e *endpoint) disableLocked() {
 	e.stopDADForPermanentAddressesLocked()
 
 	// The endpoint may have already left the multicast group.
-	if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
+	if err := e.leaveGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
 		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err))
 	}
+
+	// Leave groups from the perspective of MLD so that routers know that
+	// we are no longer interested in the group.
+	e.mld.softLeaveAll()
 }
 
 // stopDADForPermanentAddressesLocked stops DAD for all permaneent addresses.
@@ -341,7 +357,7 @@ func (e *endpoint) disableLocked() {
 // Precondition: e.mu must be write locked.
 func (e *endpoint) stopDADForPermanentAddressesLocked() {
 	// Stop DAD for all the tentative unicast addresses.
-	e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool {
+	e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool {
 		if addressEndpoint.GetKind() != stack.PermanentTentative {
 			return true
 		}
@@ -376,7 +392,7 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 	return e.nic.MaxHeaderLength() + header.IPv6MinimumSize
 }
 
-func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
+func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
 	length := uint16(pkt.Size())
 	ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
 	ip.Encode(&header.IPv6Fields{
@@ -384,8 +400,8 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
 		NextHeader:    uint8(params.Protocol),
 		HopLimit:      params.TTL,
 		TrafficClass:  params.TOS,
-		SrcAddr:       r.LocalAddress,
-		DstAddr:       r.RemoteAddress,
+		SrcAddr:       srcAddr,
+		DstAddr:       dstAddr,
 	})
 	pkt.NetworkProtocolNumber = ProtocolNumber
 }
@@ -440,7 +456,7 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU ui
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
-	e.addIPHeader(r, pkt, params)
+	e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params)
 
 	// iptables filtering. All packets that reach here are locally
 	// generated.
@@ -529,7 +545,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 
 	linkMTU := e.nic.MTU()
 	for pb := pkts.Front(); pb != nil; pb = pb.Next() {
-		e.addIPHeader(r, pb, params)
+		e.addIPHeader(r.LocalAddress, r.RemoteAddress, pb, params)
 
 		networkMTU, err := calculateNetworkMTU(linkMTU, uint32(pb.NetworkHeader().View().Size()))
 		if err != nil {
@@ -737,8 +753,11 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 		return
 	}
 
-	addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint)
-	if addressEndpoint == nil {
+	// The destination address should be an address we own or a group we joined
+	// for us to receive the packet. Otherwise, attempt to forward the packet.
+	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil {
+		addressEndpoint.DecRef()
+	} else if !e.IsInGroup(dstAddr) {
 		if !e.protocol.Forwarding() {
 			stats.IP.InvalidDestinationAddressesReceived.Increment()
 			return
@@ -747,7 +766,6 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 		_ = e.forwardPacket(pkt)
 		return
 	}
-	addressEndpoint.DecRef()
 
 	// vv consists of:
 	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
@@ -1090,9 +1108,16 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 					//
 					// Which when taken together indicate that an unknown protocol should
 					// be treated as an unrecognized next header value.
+					// The location of the Next Header field is in a different place in
+					// the initial IPv6 header than it is in the extension headers so
+					// treat it specially.
+					prevHdrIDOffset := uint32(header.IPv6NextHeaderOffset)
+					if previousHeaderStart != 0 {
+						prevHdrIDOffset = previousHeaderStart
+					}
 					_ = e.protocol.returnError(&icmpReasonParameterProblem{
 						code:    header.ICMPv6UnknownHeader,
-						pointer: it.ParseOffset(),
+						pointer: prevHdrIDOffset,
 					}, pkt)
 				default:
 					panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
@@ -1100,12 +1125,11 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) {
 			}
 
 		default:
-			_ = e.protocol.returnError(&icmpReasonParameterProblem{
-				code:    header.ICMPv6UnknownHeader,
-				pointer: it.ParseOffset(),
-			}, pkt)
-			stats.UnknownProtocolRcvdPackets.Increment()
-			return
+			// Since the iterator returns IPv6RawPayloadHeader for unknown Extension
+			// Header IDs this should never happen unless we missed a supported type
+			// here.
+			panic(fmt.Sprintf("unrecognized type from it.Next() = %T", extHdr))
+
 		}
 	}
 }
@@ -1154,8 +1178,10 @@ func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPre
 	}
 
 	snmc := header.SolicitedNodeAddr(addr.Address)
-	if _, err := e.mu.addressableEndpointState.JoinGroup(snmc); err != nil {
-		return nil, err
+	if err := e.joinGroupLocked(snmc); err != nil {
+		// joinGroupLocked only returns an error if the group address is not a valid
+		// IPv6 multicast address.
+		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", snmc, err))
 	}
 
 	addressEndpoint.SetKind(stack.PermanentTentative)
@@ -1211,7 +1237,8 @@ func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEn
 	}
 
 	snmc := header.SolicitedNodeAddr(addr.Address)
-	if _, err := e.mu.addressableEndpointState.LeaveGroup(snmc); err != nil && err != tcpip.ErrBadLocalAddress {
+	// The endpoint may have already left the multicast group.
+	if err := e.leaveGroupLocked(snmc); err != nil && err != tcpip.ErrBadLocalAddress {
 		return err
 	}
 
@@ -1234,7 +1261,7 @@ func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool {
 //
 // Precondition: e.mu must be read or write locked.
 func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint {
-	return e.mu.addressableEndpointState.ReadOnly().Lookup(localAddr)
+	return e.mu.addressableEndpointState.GetAddress(localAddr)
 }
 
 // MainAddress implements stack.AddressableEndpoint.
@@ -1285,7 +1312,7 @@ func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address
 	// Create a candidate set of available addresses we can potentially use as a
 	// source address.
 	var cs []addrCandidate
-	e.mu.addressableEndpointState.ReadOnly().ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) {
+	e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) {
 		// If r is not valid for outgoing connections, it is not a valid endpoint.
 		if !addressEndpoint.IsAssigned(allowExpired) {
 			return
@@ -1376,28 +1403,43 @@ func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
 }
 
 // JoinGroup implements stack.GroupAddressableEndpoint.
-func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+func (e *endpoint) JoinGroup(addr tcpip.Address) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.joinGroupLocked(addr)
+}
+
+// joinGroupLocked is like JoinGroup but with locking requirements.
+//
+// Precondition: e.mu must be locked.
+func (e *endpoint) joinGroupLocked(addr tcpip.Address) *tcpip.Error {
 	if !header.IsV6MulticastAddress(addr) {
-		return false, tcpip.ErrBadAddress
+		return tcpip.ErrBadAddress
 	}
 
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return e.mu.addressableEndpointState.JoinGroup(addr)
+	e.mld.joinGroup(addr)
+	return nil
 }
 
 // LeaveGroup implements stack.GroupAddressableEndpoint.
-func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+func (e *endpoint) LeaveGroup(addr tcpip.Address) *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
-	return e.mu.addressableEndpointState.LeaveGroup(addr)
+	return e.leaveGroupLocked(addr)
+}
+
+// leaveGroupLocked is like LeaveGroup but with locking requirements.
+//
+// Precondition: e.mu must be locked.
+func (e *endpoint) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
+	return e.mld.leaveGroup(addr)
 }
 
 // IsInGroup implements stack.GroupAddressableEndpoint.
 func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
-	return e.mu.addressableEndpointState.IsInGroup(addr)
+	return e.mld.isInGroup(addr)
 }
 
 var _ stack.ForwardingNetworkProtocol = (*protocol)(nil)
@@ -1405,7 +1447,8 @@ var _ stack.NetworkProtocol = (*protocol)(nil)
 var _ fragmentation.TimeoutHandler = (*protocol)(nil)
 
 type protocol struct {
-	stack *stack.Stack
+	stack   *stack.Stack
+	options Options
 
 	mu struct {
 		sync.RWMutex
@@ -1429,26 +1472,6 @@ type protocol struct {
 	forwarding uint32
 
 	fragmentation *fragmentation.Fragmentation
-
-	// ndpDisp is the NDP event dispatcher that is used to send the netstack
-	// integrator NDP related events.
-	ndpDisp NDPDispatcher
-
-	// ndpConfigs is the default NDP configurations used by an IPv6 endpoint.
-	ndpConfigs NDPConfigurations
-
-	// opaqueIIDOpts hold the options for generating opaque interface identifiers
-	// (IIDs) as outlined by RFC 7217.
-	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
-
-	// tempIIDSeed is used to seed the initial temporary interface identifier
-	// history value used to generate IIDs for temporary SLAAC addresses.
-	tempIIDSeed []byte
-
-	// autoGenIPv6LinkLocal determines whether or not the stack attempts to
-	// auto-generate an IPv6 link-local address for newly enabled non-loopback
-	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
-	autoGenIPv6LinkLocal bool
 }
 
 // Number returns the ipv6 protocol number.
@@ -1484,13 +1507,14 @@ func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.L
 	e.mu.addressableEndpointState.Init(e)
 	e.mu.ndp = ndpState{
 		ep:             e,
-		configs:        p.ndpConfigs,
+		configs:        p.options.NDPConfigs,
 		dad:            make(map[tcpip.Address]dadState),
 		defaultRouters: make(map[tcpip.Address]defaultRouterState),
 		onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
 		slaacPrefixes:  make(map[tcpip.Subnet]slaacPrefixState),
 	}
 	e.mu.ndp.initializeTempAddrState()
+	e.mld.init(e, p.options.MLD)
 
 	p.mu.Lock()
 	defer p.mu.Unlock()
@@ -1613,17 +1637,17 @@ type Options struct {
 	// NDPConfigs is the default NDP configurations used by interfaces.
 	NDPConfigs NDPConfigurations
 
-	// AutoGenIPv6LinkLocal determines whether or not the stack attempts to
-	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// AutoGenLinkLocal determines whether or not the stack attempts to
+	// auto-generate a link-local address for newly enabled non-loopback
 	// NICs.
 	//
 	// Note, setting this to true does not mean that a link-local address is
 	// assigned right away, or at all. If Duplicate Address Detection is enabled,
 	// an address is only assigned if it successfully resolves. If it fails, no
-	// further attempts are made to auto-generate an IPv6 link-local adddress.
+	// further attempts are made to auto-generate a link-local adddress.
 	//
 	// The generated link-local address follows RFC 4291 Appendix A guidelines.
-	AutoGenIPv6LinkLocal bool
+	AutoGenLinkLocal bool
 
 	// NDPDisp is the NDP event dispatcher that an integrator can provide to
 	// receive NDP related events.
@@ -1647,6 +1671,9 @@ type Options struct {
 	// seed that is too small would reduce randomness and increase predictability,
 	// defeating the purpose of temporary SLAAC addresses.
 	TempIIDSeed []byte
+
+	// MLD holds options for MLD.
+	MLD MLDOptions
 }
 
 // NewProtocolWithOptions returns an IPv6 network protocol.
@@ -1658,15 +1685,11 @@ func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
 
 	return func(s *stack.Stack) stack.NetworkProtocol {
 		p := &protocol{
-			stack:  s,
+			stack:   s,
+			options: opts,
+
 			ids:    ids,
 			hashIV: hashIV,
-
-			ndpDisp:              opts.NDPDisp,
-			ndpConfigs:           opts.NDPConfigs,
-			opaqueIIDOpts:        opts.OpaqueIIDOpts,
-			tempIIDSeed:          opts.TempIIDSeed,
-			autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
 		}
 		p.fragmentation = fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p)
 		p.mu.eps = make(map[*endpoint]struct{})
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index a671d4bac..1c01f17ab 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -51,6 +51,7 @@ const (
 	fragmentExtHdrID    = uint8(header.IPv6FragmentExtHdrIdentifier)
 	destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier)
 	noNextHdrID         = uint8(header.IPv6NoNextHeaderIdentifier)
+	unknownHdrID        = uint8(header.IPv6UnknownExtHdrIdentifier)
 
 	extraHeaderReserve = 50
 )
@@ -79,7 +80,7 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		Data: hdr.View().ToVectorisedView(),
 	}))
 
-	stats := s.Stats().ICMP.V6PacketsReceived
+	stats := s.Stats().ICMP.V6.PacketsReceived
 
 	if got := stats.NeighborAdvert.Value(); got != want {
 		t.Fatalf("got NeighborAdvert = %d, want = %d", got, want)
@@ -573,6 +574,33 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			expectICMP:   false,
 		},
 		{
+			name: "unknown next header (first)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, unknownHdrID
+			},
+			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownHeader,
+			pointer:      header.IPv6NextHeaderOffset,
+		},
+		{
+			name: "unknown next header (not first)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					unknownHdrID, 0,
+					63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownHeader,
+			pointer:      header.IPv6FixedHeaderSize,
+		},
+		{
 			name: "destination with unknown option skippable action",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
@@ -755,11 +783,6 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			pointer:      header.IPv6FixedHeaderSize,
 		},
 		{
-			name:         "No next header",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
-			shouldAccept: false,
-		},
-		{
 			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with skippable unknown)",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
@@ -873,7 +896,13 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				Length:  uint16(udpLength),
 			})
 			copy(u.Payload(), udpPayload)
-			sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+
+			dstAddr := tcpip.Address(addr2)
+			if test.multicast {
+				dstAddr = header.IPv6AllNodesMulticastAddress
+			}
+
+			sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, dstAddr, uint16(udpLength))
 			sum = header.Checksum(udpPayload, sum)
 			u.SetChecksum(^u.CalculateChecksum(sum))
 
@@ -884,10 +913,6 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			// Serialize IPv6 fixed header.
 			payloadLength := hdr.UsedLength()
 			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-			dstAddr := tcpip.Address(addr2)
-			if test.multicast {
-				dstAddr = header.IPv6AllNodesMulticastAddress
-			}
 			ip.Encode(&header.IPv6Fields{
 				PayloadLength: uint16(payloadLength),
 				NextHeader:    ipv6NextHdr,
@@ -982,9 +1007,10 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 		udpPayload2Length = 128
 		// Used to test cases where the fragment blocks are not a multiple of
 		// the fragment block size of 8 (RFC 8200 section 4.5).
-		udpPayload3Length = 127
-		udpPayload4Length = header.IPv6MaximumPayloadSize - header.UDPMinimumSize
-		fragmentExtHdrLen = 8
+		udpPayload3Length     = 127
+		udpPayload4Length     = header.IPv6MaximumPayloadSize - header.UDPMinimumSize
+		udpMaximumSizeMinus15 = header.UDPMaximumSize - 15
+		fragmentExtHdrLen     = 8
 		// Note, not all routing extension headers will be 8 bytes but this test
 		// uses 8 byte routing extension headers for most sub tests.
 		routingExtHdrLen = 8
@@ -1328,14 +1354,14 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 					dstAddr: addr2,
 					nextHdr: fragmentExtHdrID,
 					data: buffer.NewVectorisedView(
-						fragmentExtHdrLen+65520,
+						fragmentExtHdrLen+udpMaximumSizeMinus15,
 						[]buffer.View{
 							// Fragment extension header.
 							//
 							// Fragment offset = 0, More = true, ID = 1
 							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
 
-							ipv6Payload4Addr1ToAddr2[:65520],
+							ipv6Payload4Addr1ToAddr2[:udpMaximumSizeMinus15],
 						},
 					),
 				},
@@ -1344,14 +1370,17 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 					dstAddr: addr2,
 					nextHdr: fragmentExtHdrID,
 					data: buffer.NewVectorisedView(
-						fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-65520,
+						fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-udpMaximumSizeMinus15,
 						[]buffer.View{
 							// Fragment extension header.
 							//
-							// Fragment offset = 8190, More = false, ID = 1
-							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 255, 240, 0, 0, 0, 1}),
+							// Fragment offset = udpMaximumSizeMinus15/8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0,
+								udpMaximumSizeMinus15 >> 8,
+								udpMaximumSizeMinus15 & 0xff,
+								0, 0, 0, 1}),
 
-							ipv6Payload4Addr1ToAddr2[65520:],
+							ipv6Payload4Addr1ToAddr2[udpMaximumSizeMinus15:],
 						},
 					),
 				},
@@ -1359,6 +1388,47 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 			expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2},
 		},
 		{
+			name: "Two fragments with MF flag reassembled into a maximum UDP packet",
+			fragments: []fragmentData{
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+udpMaximumSizeMinus15,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload4Addr1ToAddr2[:udpMaximumSizeMinus15],
+						},
+					),
+				},
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-udpMaximumSizeMinus15,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = udpMaximumSizeMinus15/8, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0,
+								udpMaximumSizeMinus15 >> 8,
+								(udpMaximumSizeMinus15 & 0xff) + 1,
+								0, 0, 0, 1}),
+
+							ipv6Payload4Addr1ToAddr2[udpMaximumSizeMinus15:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
 			name: "Two fragments with per-fragment routing header with zero segments left",
 			fragments: []fragmentData{
 				{
@@ -2439,7 +2509,7 @@ func TestWriteStats(t *testing.T) {
 
 					test.setup(t, rt.Stack())
 
-					nWritten, _ := writer.writePackets(&rt, pkts)
+					nWritten, _ := writer.writePackets(rt, pkts)
 
 					if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent {
 						t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent)
@@ -2456,7 +2526,7 @@ func TestWriteStats(t *testing.T) {
 	}
 }
 
-func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route {
+func buildRoute(t *testing.T, ep stack.LinkEndpoint) *stack.Route {
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
 	})
diff --git a/pkg/tcpip/network/ipv6/mld.go b/pkg/tcpip/network/ipv6/mld.go
new file mode 100644
index 000000000..4c06b3f0c
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/mld.go
@@ -0,0 +1,164 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv6
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	// UnsolicitedReportIntervalMax is the maximum delay between sending
+	// unsolicited MLD reports.
+	//
+	// Obtained from RFC 2710 Section 7.10.
+	UnsolicitedReportIntervalMax = 10 * time.Second
+)
+
+// MLDOptions holds options for MLD.
+type MLDOptions struct {
+	// Enabled indicates whether MLD will be performed.
+	//
+	// When enabled, MLD may transmit MLD report and done messages when
+	// joining and leaving multicast groups respectively, and handle incoming
+	// MLD packets.
+	Enabled bool
+}
+
+var _ ip.MulticastGroupProtocol = (*mldState)(nil)
+
+// mldState is the per-interface MLD state.
+//
+// mldState.init MUST be called to initialize the MLD state.
+type mldState struct {
+	// The IPv6 endpoint this mldState is for.
+	ep *endpoint
+
+	genericMulticastProtocol ip.GenericMulticastProtocolState
+}
+
+// SendReport implements ip.MulticastGroupProtocol.
+func (mld *mldState) SendReport(groupAddress tcpip.Address) *tcpip.Error {
+	return mld.writePacket(groupAddress, groupAddress, header.ICMPv6MulticastListenerReport)
+}
+
+// SendLeave implements ip.MulticastGroupProtocol.
+func (mld *mldState) SendLeave(groupAddress tcpip.Address) *tcpip.Error {
+	return mld.writePacket(header.IPv6AllRoutersMulticastAddress, groupAddress, header.ICMPv6MulticastListenerDone)
+}
+
+// init sets up an mldState struct, and is required to be called before using
+// a new mldState.
+func (mld *mldState) init(ep *endpoint, opts MLDOptions) {
+	mld.ep = ep
+	mld.genericMulticastProtocol.Init(ip.GenericMulticastProtocolOptions{
+		Enabled:                   opts.Enabled,
+		Rand:                      ep.protocol.stack.Rand(),
+		Clock:                     ep.protocol.stack.Clock(),
+		Protocol:                  mld,
+		MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax,
+		AllNodesAddress:           header.IPv6AllNodesMulticastAddress,
+	})
+}
+
+func (mld *mldState) handleMulticastListenerQuery(mldHdr header.MLD) {
+	mld.genericMulticastProtocol.HandleQuery(mldHdr.MulticastAddress(), mldHdr.MaximumResponseDelay())
+}
+
+func (mld *mldState) handleMulticastListenerReport(mldHdr header.MLD) {
+	mld.genericMulticastProtocol.HandleReport(mldHdr.MulticastAddress())
+}
+
+// joinGroup handles joining a new group and sending and scheduling the required
+// messages.
+//
+// If the group is already joined, returns tcpip.ErrDuplicateAddress.
+func (mld *mldState) joinGroup(groupAddress tcpip.Address) {
+	mld.genericMulticastProtocol.JoinGroup(groupAddress, !mld.ep.Enabled() /* dontInitialize */)
+}
+
+// isInGroup returns true if the specified group has been joined locally.
+func (mld *mldState) isInGroup(groupAddress tcpip.Address) bool {
+	return mld.genericMulticastProtocol.IsLocallyJoined(groupAddress)
+}
+
+// leaveGroup handles removing the group from the membership map, cancels any
+// delay timers associated with that group, and sends the Done message, if
+// required.
+func (mld *mldState) leaveGroup(groupAddress tcpip.Address) *tcpip.Error {
+	// LeaveGroup returns false only if the group was not joined.
+	if mld.genericMulticastProtocol.LeaveGroup(groupAddress) {
+		return nil
+	}
+
+	return tcpip.ErrBadLocalAddress
+}
+
+// softLeaveAll leaves all groups from the perspective of MLD, but remains
+// joined locally.
+func (mld *mldState) softLeaveAll() {
+	mld.genericMulticastProtocol.MakeAllNonMember()
+}
+
+// initializeAll attemps to initialize the MLD state for each group that has
+// been joined locally.
+func (mld *mldState) initializeAll() {
+	mld.genericMulticastProtocol.InitializeGroups()
+}
+
+func (mld *mldState) writePacket(destAddress, groupAddress tcpip.Address, mldType header.ICMPv6Type) *tcpip.Error {
+	sentStats := mld.ep.protocol.stack.Stats().ICMP.V6.PacketsSent
+	var mldStat *tcpip.StatCounter
+	switch mldType {
+	case header.ICMPv6MulticastListenerReport:
+		mldStat = sentStats.MulticastListenerReport
+	case header.ICMPv6MulticastListenerDone:
+		mldStat = sentStats.MulticastListenerDone
+	default:
+		panic(fmt.Sprintf("unrecognized mld type = %d", mldType))
+	}
+
+	icmp := header.ICMPv6(buffer.NewView(header.ICMPv6HeaderSize + header.MLDMinimumSize))
+	icmp.SetType(mldType)
+	header.MLD(icmp.MessageBody()).SetMulticastAddress(groupAddress)
+	// TODO(gvisor.dev/issue/4888): We should not use the unspecified address,
+	// rather we should select an appropriate local address.
+	localAddress := header.IPv6Any
+	icmp.SetChecksum(header.ICMPv6Checksum(icmp, localAddress, destAddress, buffer.VectorisedView{}))
+
+	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: int(mld.ep.MaxHeaderLength()),
+		Data:               buffer.View(icmp).ToVectorisedView(),
+	})
+
+	mld.ep.addIPHeader(localAddress, destAddress, pkt, stack.NetworkHeaderParams{
+		Protocol: header.ICMPv6ProtocolNumber,
+		TTL:      header.MLDHopLimit,
+	})
+	// TODO(b/162198658): set the ROUTER_ALERT option when sending Host
+	// Membership Reports.
+	if err := mld.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(destAddress), nil /* gso */, ProtocolNumber, pkt); err != nil {
+		sentStats.Dropped.Increment()
+		return err
+	}
+	mldStat.Increment()
+	return nil
+}
diff --git a/pkg/tcpip/network/ipv6/mld_test.go b/pkg/tcpip/network/ipv6/mld_test.go
new file mode 100644
index 000000000..5677bdd54
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/mld_test.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv6_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+)
+
+func TestIPv6JoinLeaveSolicitedNodeAddressPerformsMLD(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			MLD: ipv6.MLDOptions{
+				Enabled: true,
+			},
+		})},
+	})
+	e := channel.New(1, header.IPv6MinimumMTU, "")
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	// The stack will join an address's solicited node multicast address when
+	// an address is added. An MLD report message should be sent for the
+	// solicited-node group.
+	if err := s.AddAddress(nicID, ipv6.ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ipv6.ProtocolNumber, addr1, err)
+	}
+	{
+		p, ok := e.Read()
+		if !ok {
+			t.Fatal("expected a report message to be sent")
+		}
+		snmc := header.SolicitedNodeAddr(addr1)
+		checker.IPv6(t, header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader())),
+			checker.DstAddr(snmc),
+			// Hop Limit for an MLD message must be 1 as per RFC 2710 section 3.
+			checker.TTL(1),
+			checker.MLD(header.ICMPv6MulticastListenerReport, header.MLDMinimumSize,
+				checker.MLDMaxRespDelay(0),
+				checker.MLDMulticastAddress(snmc),
+			),
+		)
+	}
+
+	// The stack will leave an address's solicited node multicast address when
+	// an address is removed. An MLD done message should be sent for the
+	// solicited-node group.
+	if err := s.RemoveAddress(nicID, addr1); err != nil {
+		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
+	}
+	{
+		p, ok := e.Read()
+		if !ok {
+			t.Fatal("expected a done message to be sent")
+		}
+		snmc := header.SolicitedNodeAddr(addr1)
+		checker.IPv6(t, header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader())),
+			checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+			checker.TTL(1),
+			checker.MLD(header.ICMPv6MulticastListenerDone, header.MLDMinimumSize,
+				checker.MLDMaxRespDelay(0),
+				checker.MLDMulticastAddress(snmc),
+			),
+		)
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/ndp.go b/pkg/tcpip/network/ipv6/ndp.go
index 40da011f8..8cb7d4dab 100644
--- a/pkg/tcpip/network/ipv6/ndp.go
+++ b/pkg/tcpip/network/ipv6/ndp.go
@@ -471,17 +471,8 @@ type ndpState struct {
 	// The default routers discovered through Router Advertisements.
 	defaultRouters map[tcpip.Address]defaultRouterState
 
-	rtrSolicit struct {
-		// The timer used to send the next router solicitation message.
-		timer tcpip.Timer
-
-		// Used to let the Router Solicitation timer know that it has been stopped.
-		//
-		// Must only be read from or written to while protected by the lock of
-		// the IPv6 endpoint this ndpState is associated with. MUST be set when the
-		// timer is set.
-		done *bool
-	}
+	// The job used to send the next router solicitation message.
+	rtrSolicitJob *tcpip.Job
 
 	// The on-link prefixes discovered through Router Advertisements' Prefix
 	// Information option.
@@ -507,7 +498,7 @@ type ndpState struct {
 // to the DAD goroutine that DAD should stop.
 type dadState struct {
 	// The DAD timer to send the next NS message, or resolve the address.
-	timer tcpip.Timer
+	job *tcpip.Job
 
 	// Used to let the DAD timer know that it has been stopped.
 	//
@@ -648,96 +639,70 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressE
 
 		// Consider DAD to have resolved even if no DAD messages were actually
 		// transmitted.
-		if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, true, nil)
 		}
 
 		return nil
 	}
 
-	var done bool
-	var timer tcpip.Timer
-	// We initially start a timer to fire immediately because some of the DAD work
-	// cannot be done while holding the IPv6 endpoint's lock. This is effectively
-	// the same as starting a goroutine but we use a timer that fires immediately
-	// so we can reset it for the next DAD iteration.
-	timer = ndp.ep.protocol.stack.Clock().AfterFunc(0, func() {
-		ndp.ep.mu.Lock()
-		defer ndp.ep.mu.Unlock()
-
-		if done {
-			// If we reach this point, it means that the DAD timer fired after
-			// another goroutine already obtained the IPv6 endpoint lock and stopped
-			// DAD before this function obtained the NIC lock. Simply return here and
-			// do nothing further.
-			return
-		}
-
-		if addressEndpoint.GetKind() != stack.PermanentTentative {
-			// The endpoint should still be marked as tentative since we are still
-			// performing DAD on it.
-			panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID()))
-		}
+	state := dadState{
+		job: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
+			state, ok := ndp.dad[addr]
+			if !ok {
+				panic(fmt.Sprintf("ndpdad: DAD timer fired but missing state for %s on NIC(%d)", addr, ndp.ep.nic.ID()))
+			}
 
-		dadDone := remaining == 0
-
-		var err *tcpip.Error
-		if !dadDone {
-			// Use the unspecified address as the source address when performing DAD.
-			addressEndpoint := ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint)
-
-			// Do not hold the lock when sending packets which may be a long running
-			// task or may block link address resolution. We know this is safe
-			// because immediately after obtaining the lock again, we check if DAD
-			// has been stopped before doing any work with the IPv6 endpoint. Note,
-			// DAD would be stopped if the IPv6 endpoint was disabled or closed, or if
-			// the address was removed.
-			ndp.ep.mu.Unlock()
-			err = ndp.sendDADPacket(addr, addressEndpoint)
-			ndp.ep.mu.Lock()
-			addressEndpoint.DecRef()
-		}
+			if addressEndpoint.GetKind() != stack.PermanentTentative {
+				// The endpoint should still be marked as tentative since we are still
+				// performing DAD on it.
+				panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID()))
+			}
 
-		if done {
-			// If we reach this point, it means that DAD was stopped after we released
-			// the IPv6 endpoint's read lock and before we obtained the write lock.
-			return
-		}
+			dadDone := remaining == 0
 
-		if dadDone {
-			// DAD has resolved.
-			addressEndpoint.SetKind(stack.Permanent)
-		} else if err == nil {
-			// DAD is not done and we had no errors when sending the last NDP NS,
-			// schedule the next DAD timer.
-			remaining--
-			timer.Reset(ndp.configs.RetransmitTimer)
-			return
-		}
+			var err *tcpip.Error
+			if !dadDone {
+				err = ndp.sendDADPacket(addr, addressEndpoint)
+			}
 
-		// At this point we know that either DAD is done or we hit an error sending
-		// the last NDP NS. Either way, clean up addr's DAD state and let the
-		// integrator know DAD has completed.
-		delete(ndp.dad, addr)
+			if dadDone {
+				// DAD has resolved.
+				addressEndpoint.SetKind(stack.Permanent)
+			} else if err == nil {
+				// DAD is not done and we had no errors when sending the last NDP NS,
+				// schedule the next DAD timer.
+				remaining--
+				state.job.Schedule(ndp.configs.RetransmitTimer)
+				return
+			}
 
-		if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
-			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, dadDone, err)
-		}
+			// At this point we know that either DAD is done or we hit an error
+			// sending the last NDP NS. Either way, clean up addr's DAD state and let
+			// the integrator know DAD has completed.
+			delete(ndp.dad, addr)
 
-		// If DAD resolved for a stable SLAAC address, attempt generation of a
-		// temporary SLAAC address.
-		if dadDone && addressEndpoint.ConfigType() == stack.AddressConfigSlaac {
-			// Reset the generation attempts counter as we are starting the generation
-			// of a new address for the SLAAC prefix.
-			ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */)
-		}
-	})
+			if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
+				ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, dadDone, err)
+			}
 
-	ndp.dad[addr] = dadState{
-		timer: timer,
-		done:  &done,
+			// If DAD resolved for a stable SLAAC address, attempt generation of a
+			// temporary SLAAC address.
+			if dadDone && addressEndpoint.ConfigType() == stack.AddressConfigSlaac {
+				// Reset the generation attempts counter as we are starting the generation
+				// of a new address for the SLAAC prefix.
+				ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */)
+			}
+		}),
 	}
 
+	// We initially start a timer to fire immediately because some of the DAD work
+	// cannot be done while holding the IPv6 endpoint's lock. This is effectively
+	// the same as starting a goroutine but we use a timer that fires immediately
+	// so we can reset it for the next DAD iteration.
+	state.job.Schedule(0)
+	ndp.dad[addr] = state
+
 	return nil
 }
 
@@ -745,55 +710,31 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressE
 // addr.
 //
 // addr must be a tentative IPv6 address on ndp's IPv6 endpoint.
-//
-// The IPv6 endpoint that ndp belongs to MUST NOT be locked.
 func (ndp *ndpState) sendDADPacket(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) *tcpip.Error {
 	snmc := header.SolicitedNodeAddr(addr)
 
-	r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), header.IPv6Any, snmc, ProtocolNumber, false /* multicastLoop */)
-	if err != nil {
-		return err
-	}
-	defer r.Release()
-
-	// Route should resolve immediately since snmc is a multicast address so a
-	// remote link address can be calculated without a resolution process.
-	if c, err := r.Resolve(nil); err != nil {
-		// Do not consider the NIC being unknown or disabled as a fatal error.
-		// Since this method is required to be called when the IPv6 endpoint is not
-		// locked, the NIC could have been disabled or removed by another goroutine.
-		if err == tcpip.ErrUnknownNICID || err != tcpip.ErrInvalidEndpointState {
-			return err
-		}
-
-		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.ep.nic.ID(), err))
-	} else if c != nil {
-		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.ep.nic.ID()))
-	}
-
-	icmpData := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize))
-	icmpData.SetType(header.ICMPv6NeighborSolicit)
-	ns := header.NDPNeighborSolicit(icmpData.NDPPayload())
+	icmp := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize))
+	icmp.SetType(header.ICMPv6NeighborSolicit)
+	ns := header.NDPNeighborSolicit(icmp.MessageBody())
 	ns.SetTargetAddress(addr)
-	icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+	icmp.SetChecksum(header.ICMPv6Checksum(icmp, header.IPv6Any, snmc, buffer.VectorisedView{}))
 
 	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-		ReserveHeaderBytes: int(r.MaxHeaderLength()),
-		Data:               buffer.View(icmpData).ToVectorisedView(),
+		ReserveHeaderBytes: int(ndp.ep.MaxHeaderLength()),
+		Data:               buffer.View(icmp).ToVectorisedView(),
 	})
 
-	sent := r.Stats().ICMP.V6PacketsSent
-	if err := r.WritePacket(nil,
-		stack.NetworkHeaderParams{
-			Protocol: header.ICMPv6ProtocolNumber,
-			TTL:      header.NDPHopLimit,
-		}, pkt,
-	); err != nil {
+	sent := ndp.ep.protocol.stack.Stats().ICMP.V6.PacketsSent
+	ndp.ep.addIPHeader(header.IPv6Any, snmc, pkt, stack.NetworkHeaderParams{
+		Protocol: header.ICMPv6ProtocolNumber,
+		TTL:      header.NDPHopLimit,
+	})
+
+	if err := ndp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(snmc), nil /* gso */, ProtocolNumber, pkt); err != nil {
 		sent.Dropped.Increment()
 		return err
 	}
 	sent.NeighborSolicit.Increment()
-
 	return nil
 }
 
@@ -812,18 +753,11 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 		return
 	}
 
-	if dad.timer != nil {
-		dad.timer.Stop()
-		dad.timer = nil
-
-		*dad.done = true
-		dad.done = nil
-	}
-
+	dad.job.Cancel()
 	delete(ndp.dad, addr)
 
 	// Let the integrator know DAD did not resolve.
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, false, nil)
 	}
 }
@@ -846,7 +780,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	// Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we
 	// only inform the dispatcher on configuration changes. We do nothing else
 	// with the information.
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		var configuration DHCPv6ConfigurationFromNDPRA
 		switch {
 		case ra.ManagedAddrConfFlag():
@@ -903,20 +837,20 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
 		switch opt := opt.(type) {
 		case header.NDPRecursiveDNSServer:
-			if ndp.ep.protocol.ndpDisp == nil {
+			if ndp.ep.protocol.options.NDPDisp == nil {
 				continue
 			}
 
 			addrs, _ := opt.Addresses()
-			ndp.ep.protocol.ndpDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime())
+			ndp.ep.protocol.options.NDPDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime())
 
 		case header.NDPDNSSearchList:
-			if ndp.ep.protocol.ndpDisp == nil {
+			if ndp.ep.protocol.options.NDPDisp == nil {
 				continue
 			}
 
 			domainNames, _ := opt.DomainNames()
-			ndp.ep.protocol.ndpDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime())
+			ndp.ep.protocol.options.NDPDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime())
 
 		case header.NDPPrefixInformation:
 			prefix := opt.Subnet()
@@ -964,7 +898,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	delete(ndp.defaultRouters, ip)
 
 	// Let the integrator know a discovered default router is invalidated.
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		ndpDisp.OnDefaultRouterInvalidated(ndp.ep.nic.ID(), ip)
 	}
 }
@@ -976,7 +910,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 //
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
-	ndpDisp := ndp.ep.protocol.ndpDisp
+	ndpDisp := ndp.ep.protocol.options.NDPDisp
 	if ndpDisp == nil {
 		return
 	}
@@ -1006,7 +940,7 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 //
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
-	ndpDisp := ndp.ep.protocol.ndpDisp
+	ndpDisp := ndp.ep.protocol.options.NDPDisp
 	if ndpDisp == nil {
 		return
 	}
@@ -1047,7 +981,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	delete(ndp.onLinkPrefixes, prefix)
 
 	// Let the integrator know a discovered on-link prefix is invalidated.
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		ndpDisp.OnOnLinkPrefixInvalidated(ndp.ep.nic.ID(), prefix)
 	}
 }
@@ -1225,7 +1159,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, configType stack.AddressConfigType, deprecated bool) stack.AddressEndpoint {
 	// Inform the integrator that we have a new SLAAC address.
-	ndpDisp := ndp.ep.protocol.ndpDisp
+	ndpDisp := ndp.ep.protocol.options.NDPDisp
 	if ndpDisp == nil {
 		return nil
 	}
@@ -1272,7 +1206,7 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt
 		}
 
 		dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures
-		if oIID := ndp.ep.protocol.opaqueIIDOpts; oIID.NICNameFromID != nil {
+		if oIID := ndp.ep.protocol.options.OpaqueIIDOpts; oIID.NICNameFromID != nil {
 			addrBytes = header.AppendOpaqueInterfaceIdentifier(
 				addrBytes[:header.IIDOffsetInIPv6Address],
 				prefix,
@@ -1676,7 +1610,7 @@ func (ndp *ndpState) deprecateSLAACAddress(addressEndpoint stack.AddressEndpoint
 	}
 
 	addressEndpoint.SetDeprecated(true)
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix())
 	}
 }
@@ -1701,7 +1635,7 @@ func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefi
 //
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) {
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr)
 	}
 
@@ -1761,7 +1695,7 @@ func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLA
 //
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidateAddr bool) {
-	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
 		ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr)
 	}
 
@@ -1859,7 +1793,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 //
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) startSolicitingRouters() {
-	if ndp.rtrSolicit.timer != nil {
+	if ndp.rtrSolicitJob != nil {
 		// We are already soliciting routers.
 		return
 	}
@@ -1876,56 +1810,14 @@ func (ndp *ndpState) startSolicitingRouters() {
 		delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay)))
 	}
 
-	var done bool
-	ndp.rtrSolicit.done = &done
-	ndp.rtrSolicit.timer = ndp.ep.protocol.stack.Clock().AfterFunc(delay, func() {
-		ndp.ep.mu.Lock()
-		if done {
-			// If we reach this point, it means that the RS timer fired after another
-			// goroutine already obtained the IPv6 endpoint lock and stopped
-			// solicitations. Simply return here and do nothing further.
-			ndp.ep.mu.Unlock()
-			return
-		}
-
+	ndp.rtrSolicitJob = ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
 		// to the sending interface, or the unspecified address if no address is
 		// assigned to the sending interface.
-		addressEndpoint := ndp.ep.acquireOutgoingPrimaryAddressRLocked(header.IPv6AllRoutersMulticastAddress, false)
-		if addressEndpoint == nil {
-			// Incase this ends up creating a new temporary address, we need to hold
-			// onto the endpoint until a route is obtained. If we decrement the
-			// reference count before obtaing a route, the address's resources would
-			// be released and attempting to obtain a route after would fail. Once a
-			// route is obtainted, it is safe to decrement the reference count since
-			// obtaining a route increments the address's reference count.
-			addressEndpoint = ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint)
-		}
-		ndp.ep.mu.Unlock()
-
-		localAddr := addressEndpoint.AddressWithPrefix().Address
-		r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), localAddr, header.IPv6AllRoutersMulticastAddress, ProtocolNumber, false /* multicastLoop */)
-		addressEndpoint.DecRef()
-		if err != nil {
-			return
-		}
-		defer r.Release()
-
-		// Route should resolve immediately since
-		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
-		// remote link address can be calculated without a resolution process.
-		if c, err := r.Resolve(nil); err != nil {
-			// Do not consider the NIC being unknown or disabled as a fatal error.
-			// Since this method is required to be called when the IPv6 endpoint is
-			// not locked, the IPv6 endpoint could have been disabled or removed by
-			// another goroutine.
-			if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState {
-				return
-			}
-
-			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID(), err))
-		} else if c != nil {
-			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID()))
+		localAddr := header.IPv6Any
+		if addressEndpoint := ndp.ep.acquireOutgoingPrimaryAddressRLocked(header.IPv6AllRoutersMulticastAddress, false); addressEndpoint != nil {
+			localAddr = addressEndpoint.AddressWithPrefix().Address
+			addressEndpoint.DecRef()
 		}
 
 		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
@@ -1936,30 +1828,31 @@ func (ndp *ndpState) startSolicitingRouters() {
 		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
 		// LinkEndpoint.LinkAddress) before reaching this point.
 		var optsSerializer header.NDPOptionsSerializer
-		if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(r.LocalLinkAddress) {
+		linkAddress := ndp.ep.nic.LinkAddress()
+		if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(linkAddress) {
 			optsSerializer = header.NDPOptionsSerializer{
-				header.NDPSourceLinkLayerAddressOption(r.LocalLinkAddress),
+				header.NDPSourceLinkLayerAddressOption(linkAddress),
 			}
 		}
 		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length())
 		icmpData := header.ICMPv6(buffer.NewView(payloadSize))
 		icmpData.SetType(header.ICMPv6RouterSolicit)
-		rs := header.NDPRouterSolicit(icmpData.NDPPayload())
+		rs := header.NDPRouterSolicit(icmpData.MessageBody())
 		rs.Options().Serialize(optsSerializer)
-		icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+		icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, localAddr, header.IPv6AllRoutersMulticastAddress, buffer.VectorisedView{}))
 
 		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: int(r.MaxHeaderLength()),
+			ReserveHeaderBytes: int(ndp.ep.MaxHeaderLength()),
 			Data:               buffer.View(icmpData).ToVectorisedView(),
 		})
 
-		sent := r.Stats().ICMP.V6PacketsSent
-		if err := r.WritePacket(nil,
-			stack.NetworkHeaderParams{
-				Protocol: header.ICMPv6ProtocolNumber,
-				TTL:      header.NDPHopLimit,
-			}, pkt,
-		); err != nil {
+		sent := ndp.ep.protocol.stack.Stats().ICMP.V6.PacketsSent
+		ndp.ep.addIPHeader(localAddr, header.IPv6AllRoutersMulticastAddress, pkt, stack.NetworkHeaderParams{
+			Protocol: header.ICMPv6ProtocolNumber,
+			TTL:      header.NDPHopLimit,
+		})
+
+		if err := ndp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress), nil /* gso */, ProtocolNumber, pkt); err != nil {
 			sent.Dropped.Increment()
 			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.ep.nic.ID(), err)
 			// Don't send any more messages if we had an error.
@@ -1969,21 +1862,12 @@ func (ndp *ndpState) startSolicitingRouters() {
 			remaining--
 		}
 
-		ndp.ep.mu.Lock()
-		if done || remaining == 0 {
-			ndp.rtrSolicit.timer = nil
-			ndp.rtrSolicit.done = nil
-		} else if ndp.rtrSolicit.timer != nil {
-			// Note, we need to explicitly check to make sure that
-			// the timer field is not nil because if it was nil but
-			// we still reached this point, then we know the IPv6 endpoint
-			// was requested to stop soliciting routers so we don't
-			// need to send the next Router Solicitation message.
-			ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval)
+		if remaining != 0 {
+			ndp.rtrSolicitJob.Schedule(ndp.configs.RtrSolicitationInterval)
 		}
-		ndp.ep.mu.Unlock()
 	})
 
+	ndp.rtrSolicitJob.Schedule(delay)
 }
 
 // stopSolicitingRouters stops soliciting routers. If routers are not currently
@@ -1991,21 +1875,19 @@ func (ndp *ndpState) startSolicitingRouters() {
 //
 // The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) stopSolicitingRouters() {
-	if ndp.rtrSolicit.timer == nil {
+	if ndp.rtrSolicitJob == nil {
 		// Nothing to do.
 		return
 	}
 
-	*ndp.rtrSolicit.done = true
-	ndp.rtrSolicit.timer.Stop()
-	ndp.rtrSolicit.timer = nil
-	ndp.rtrSolicit.done = nil
+	ndp.rtrSolicitJob.Cancel()
+	ndp.rtrSolicitJob = nil
 }
 
 // initializeTempAddrState initializes state related to temporary SLAAC
 // addresses.
 func (ndp *ndpState) initializeTempAddrState() {
-	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.tempIIDSeed, ndp.ep.nic.ID())
+	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.options.TempIIDSeed, ndp.ep.nic.ID())
 
 	if MaxDesyncFactor != 0 {
 		ndp.temporaryAddressDesyncFactor = time.Duration(rand.Int63n(int64(MaxDesyncFactor)))
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 37e8b1083..95c626bb8 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -205,7 +205,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
 			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
 			pkt.SetType(header.ICMPv6NeighborSolicit)
-			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns := header.NDPNeighborSolicit(pkt.MessageBody())
 			ns.SetTargetAddress(lladdr0)
 			opts := ns.Options()
 			copy(opts, test.optsBuf)
@@ -220,7 +220,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 				DstAddr:       lladdr0,
 			})
 
-			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+			invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid
 
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
@@ -311,7 +311,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testi
 			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
 			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
 			pkt.SetType(header.ICMPv6NeighborSolicit)
-			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns := header.NDPNeighborSolicit(pkt.MessageBody())
 			ns.SetTargetAddress(lladdr0)
 			opts := ns.Options()
 			copy(opts, test.optsBuf)
@@ -326,16 +326,16 @@ func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testi
 				DstAddr:       lladdr0,
 			})
 
-			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+			invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid
 
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 				Data: hdr.View().ToVectorisedView(),
-			})
+			}))
 
 			neighbors, err := s.Neighbors(nicID)
 			if err != nil {
@@ -591,7 +591,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 					hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
 					pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
 					pkt.SetType(header.ICMPv6NeighborSolicit)
-					ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+					ns := header.NDPNeighborSolicit(pkt.MessageBody())
 					ns.SetTargetAddress(nicAddr)
 					opts := ns.Options()
 					opts.Serialize(test.nsOpts)
@@ -606,7 +606,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 						DstAddr:       test.nsDst,
 					})
 
-					invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+					invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid
 
 					// Invalid count should initially be 0.
 					if got := invalid.Value(); got != 0 {
@@ -650,8 +650,8 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 						if p.Route.RemoteAddress != respNSDst {
 							t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst)
 						}
-						if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want {
-							t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want)
+						if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(respNSDst); got != want {
+							t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, want)
 						}
 
 						checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
@@ -672,7 +672,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 						hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
 						pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
 						pkt.SetType(header.ICMPv6NeighborAdvert)
-						na := header.NDPNeighborAdvert(pkt.NDPPayload())
+						na := header.NDPNeighborAdvert(pkt.MessageBody())
 						na.SetSolicitedFlag(true)
 						na.SetOverrideFlag(true)
 						na.SetTargetAddress(test.nsSrc)
@@ -706,8 +706,8 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 					if p.Route.RemoteAddress != test.naDst {
 						t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, test.naDst)
 					}
-					if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
-						t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+					if got := p.Route.RemoteLinkAddress(); got != test.naDstLinkAddr {
+						t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, test.naDstLinkAddr)
 					}
 
 					checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
@@ -777,7 +777,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
 			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
 			pkt.SetType(header.ICMPv6NeighborAdvert)
-			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns := header.NDPNeighborAdvert(pkt.MessageBody())
 			ns.SetTargetAddress(lladdr1)
 			opts := ns.Options()
 			copy(opts, test.optsBuf)
@@ -792,7 +792,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 				DstAddr:       lladdr0,
 			})
 
-			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+			invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid
 
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
@@ -890,7 +890,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *test
 			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
 			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
 			pkt.SetType(header.ICMPv6NeighborAdvert)
-			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns := header.NDPNeighborAdvert(pkt.MessageBody())
 			ns.SetTargetAddress(lladdr1)
 			opts := ns.Options()
 			copy(opts, test.optsBuf)
@@ -905,16 +905,16 @@ func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *test
 				DstAddr:       lladdr0,
 			})
 
-			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+			invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid
 
 			// Invalid count should initially be 0.
 			if got := invalid.Value(); got != 0 {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 				Data: hdr.View().ToVectorisedView(),
-			})
+			}))
 
 			neighbors, err := s.Neighbors(nicID)
 			if err != nil {
@@ -1122,7 +1122,7 @@ func TestNDPValidation(t *testing.T) {
 									s.SetForwarding(ProtocolNumber, true)
 								}
 
-								stats := s.Stats().ICMP.V6PacketsReceived
+								stats := s.Stats().ICMP.V6.PacketsReceived
 								invalid := stats.Invalid
 								routerOnly := stats.RouterOnlyPacketsDroppedByHost
 								typStat := typ.statCounter(stats)
@@ -1346,7 +1346,7 @@ func TestRouterAdvertValidation(t *testing.T) {
 					pkt := header.ICMPv6(hdr.Prepend(icmpSize))
 					pkt.SetType(header.ICMPv6RouterAdvert)
 					pkt.SetCode(test.code)
-					copy(pkt.NDPPayload(), test.ndpPayload)
+					copy(pkt.MessageBody(), test.ndpPayload)
 					payloadLength := hdr.UsedLength()
 					pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
 					ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
@@ -1358,7 +1358,7 @@ func TestRouterAdvertValidation(t *testing.T) {
 						DstAddr:       header.IPv6AllNodesMulticastAddress,
 					})
 
-					stats := s.Stats().ICMP.V6PacketsReceived
+					stats := s.Stats().ICMP.V6.PacketsReceived
 					invalid := stats.Invalid
 					rxRA := stats.RouterAdvert
 
diff --git a/pkg/tcpip/network/multicast_group_test.go b/pkg/tcpip/network/multicast_group_test.go
new file mode 100644
index 000000000..95fb67986
--- /dev/null
+++ b/pkg/tcpip/network/multicast_group_test.go
@@ -0,0 +1,1069 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ip_test
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+
+	ipv4MulticastAddr1 = tcpip.Address("\xe0\x00\x00\x03")
+	ipv4MulticastAddr2 = tcpip.Address("\xe0\x00\x00\x04")
+	ipv4MulticastAddr3 = tcpip.Address("\xe0\x00\x00\x05")
+	ipv6MulticastAddr1 = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
+	ipv6MulticastAddr2 = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04")
+	ipv6MulticastAddr3 = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05")
+
+	igmpMembershipQuery    = uint8(header.IGMPMembershipQuery)
+	igmpv1MembershipReport = uint8(header.IGMPv1MembershipReport)
+	igmpv2MembershipReport = uint8(header.IGMPv2MembershipReport)
+	igmpLeaveGroup         = uint8(header.IGMPLeaveGroup)
+	mldQuery               = uint8(header.ICMPv6MulticastListenerQuery)
+	mldReport              = uint8(header.ICMPv6MulticastListenerReport)
+	mldDone                = uint8(header.ICMPv6MulticastListenerDone)
+)
+
+var (
+	// unsolicitedIGMPReportIntervalMaxTenthSec is the maximum amount of time the
+	// NIC will wait before sending an unsolicited report after joining a
+	// multicast group, in deciseconds.
+	unsolicitedIGMPReportIntervalMaxTenthSec = func() uint8 {
+		const decisecond = time.Second / 10
+		if ipv4.UnsolicitedReportIntervalMax%decisecond != 0 {
+			panic(fmt.Sprintf("UnsolicitedReportIntervalMax of %d is a lossy conversion to deciseconds", ipv4.UnsolicitedReportIntervalMax))
+		}
+		return uint8(ipv4.UnsolicitedReportIntervalMax / decisecond)
+	}()
+)
+
+// validateMLDPacket checks that a passed PacketInfo is an IPv6 MLD packet
+// sent to the provided address with the passed fields set.
+func validateMLDPacket(t *testing.T, p channel.PacketInfo, remoteAddress tcpip.Address, mldType uint8, maxRespTime byte, groupAddress tcpip.Address) {
+	t.Helper()
+
+	payload := header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader()))
+	checker.IPv6(t, payload,
+		checker.DstAddr(remoteAddress),
+		// Hop Limit for an MLD message must be 1 as per RFC 2710 section 3.
+		checker.TTL(1),
+		checker.MLD(header.ICMPv6Type(mldType), header.MLDMinimumSize,
+			checker.MLDMaxRespDelay(time.Duration(maxRespTime)*time.Millisecond),
+			checker.MLDMulticastAddress(groupAddress),
+		),
+	)
+}
+
+// validateIGMPPacket checks that a passed PacketInfo is an IPv4 IGMP packet
+// sent to the provided address with the passed fields set.
+func validateIGMPPacket(t *testing.T, p channel.PacketInfo, remoteAddress tcpip.Address, igmpType uint8, maxRespTime byte, groupAddress tcpip.Address) {
+	t.Helper()
+
+	payload := header.IPv4(stack.PayloadSince(p.Pkt.NetworkHeader()))
+	checker.IPv4(t, payload,
+		checker.DstAddr(remoteAddress),
+		// TTL for an IGMP message must be 1 as per RFC 2236 section 2.
+		checker.TTL(1),
+		checker.IPv4RouterAlert(),
+		checker.IGMP(
+			checker.IGMPType(header.IGMPType(igmpType)),
+			checker.IGMPMaxRespTime(header.DecisecondToDuration(maxRespTime)),
+			checker.IGMPGroupAddress(groupAddress),
+		),
+	)
+}
+
+func createStack(t *testing.T, mgpEnabled bool) (*channel.Endpoint, *stack.Stack, *faketime.ManualClock) {
+	t.Helper()
+
+	// Create an endpoint of queue size 2, since no more than 2 packets are ever
+	// queued in the tests in this file.
+	e := channel.New(2, 1280, linkAddr)
+	clock := faketime.NewManualClock()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{
+			ipv4.NewProtocolWithOptions(ipv4.Options{
+				IGMP: ipv4.IGMPOptions{
+					Enabled: mgpEnabled,
+				},
+			}),
+			ipv6.NewProtocolWithOptions(ipv6.Options{
+				MLD: ipv6.MLDOptions{
+					Enabled: mgpEnabled,
+				},
+			}),
+		},
+		Clock: clock,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	return e, s, clock
+}
+
+// createAndInjectIGMPPacket creates and injects an IGMP packet with the
+// specified fields.
+//
+// Note, the router alert option is not included in this packet.
+//
+// TODO(b/162198658): set the router alert option.
+func createAndInjectIGMPPacket(e *channel.Endpoint, igmpType byte, maxRespTime byte, groupAddress tcpip.Address) {
+	buf := buffer.NewView(header.IPv4MinimumSize + header.IGMPQueryMinimumSize)
+
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		TotalLength: uint16(len(buf)),
+		TTL:         header.IGMPTTL,
+		Protocol:    uint8(header.IGMPProtocolNumber),
+		SrcAddr:     header.IPv4Any,
+		DstAddr:     header.IPv4AllSystems,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	igmp := header.IGMP(buf[header.IPv4MinimumSize:])
+	igmp.SetType(header.IGMPType(igmpType))
+	igmp.SetMaxRespTime(maxRespTime)
+	igmp.SetGroupAddress(groupAddress)
+	igmp.SetChecksum(header.IGMPCalculateChecksum(igmp))
+
+	e.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+}
+
+// createAndInjectMLDPacket creates and injects an MLD packet with the
+// specified fields.
+//
+// Note, the router alert option is not included in this packet.
+//
+// TODO(b/162198658): set the router alert option.
+func createAndInjectMLDPacket(e *channel.Endpoint, mldType uint8, maxRespDelay byte, groupAddress tcpip.Address) {
+	icmpSize := header.ICMPv6HeaderSize + header.MLDMinimumSize
+	buf := buffer.NewView(header.IPv6MinimumSize + icmpSize)
+
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(icmpSize),
+		HopLimit:      header.MLDHopLimit,
+		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+		SrcAddr:       header.IPv4Any,
+		DstAddr:       header.IPv6AllNodesMulticastAddress,
+	})
+
+	icmp := header.ICMPv6(buf[header.IPv6MinimumSize:])
+	icmp.SetType(header.ICMPv6Type(mldType))
+	mld := header.MLD(icmp.MessageBody())
+	mld.SetMaximumResponseDelay(uint16(maxRespDelay))
+	mld.SetMulticastAddress(groupAddress)
+	icmp.SetChecksum(header.ICMPv6Checksum(icmp, header.IPv6Any, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+
+	e.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+}
+
+// TestMGPDisabled tests that the multicast group protocol is not enabled by
+// default.
+func TestMGPDisabled(t *testing.T) {
+	tests := []struct {
+		name              string
+		protoNum          tcpip.NetworkProtocolNumber
+		multicastAddr     tcpip.Address
+		sentReportStat    func(*stack.Stack) *tcpip.StatCounter
+		receivedQueryStat func(*stack.Stack) *tcpip.StatCounter
+		rxQuery           func(*channel.Endpoint)
+	}{
+		{
+			name:          "IGMP",
+			protoNum:      ipv4.ProtocolNumber,
+			multicastAddr: ipv4MulticastAddr1,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.V2MembershipReport
+			},
+			receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.MembershipQuery
+			},
+			rxQuery: func(e *channel.Endpoint) {
+				createAndInjectIGMPPacket(e, igmpMembershipQuery, unsolicitedIGMPReportIntervalMaxTenthSec, header.IPv4Any)
+			},
+		},
+		{
+			name:          "MLD",
+			protoNum:      ipv6.ProtocolNumber,
+			multicastAddr: ipv6MulticastAddr1,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport
+			},
+			receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery
+			},
+			rxQuery: func(e *channel.Endpoint) {
+				createAndInjectMLDPacket(e, mldQuery, 0, header.IPv6Any)
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e, s, clock := createStack(t, false)
+
+			// This NIC may join multicast groups when it is enabled but since MGP is
+			// disabled, no reports should be sent.
+			sentReportStat := test.sentReportStat(s)
+			if got := sentReportStat.Value(); got != 0 {
+				t.Fatalf("got sentReportState.Value() = %d, want = 0", got)
+			}
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet, stack with disabled MGP sent packet = %#v", p.Pkt)
+			}
+
+			// Test joining a specific group explicitly and verify that no reports are
+			// sent.
+			if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+				t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err)
+			}
+			if got := sentReportStat.Value(); got != 0 {
+				t.Fatalf("got sentReportState.Value() = %d, want = 0", got)
+			}
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet, stack with disabled IGMP sent packet = %#v", p.Pkt)
+			}
+
+			// Inject a general query message. This should only trigger a report to be
+			// sent if the MGP was enabled.
+			test.rxQuery(e)
+			if got := test.receivedQueryStat(s).Value(); got != 1 {
+				t.Fatalf("got receivedQueryStat(_).Value() = %d, want = 1", got)
+			}
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet, stack with disabled IGMP sent packet = %+v", p.Pkt)
+			}
+		})
+	}
+}
+
+func TestMGPReceiveCounters(t *testing.T) {
+	tests := []struct {
+		name         string
+		headerType   uint8
+		maxRespTime  byte
+		groupAddress tcpip.Address
+		statCounter  func(*stack.Stack) *tcpip.StatCounter
+		rxMGPkt      func(*channel.Endpoint, byte, byte, tcpip.Address)
+	}{
+		{
+			name:         "IGMP Membership Query",
+			headerType:   igmpMembershipQuery,
+			maxRespTime:  unsolicitedIGMPReportIntervalMaxTenthSec,
+			groupAddress: header.IPv4Any,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.MembershipQuery
+			},
+			rxMGPkt: createAndInjectIGMPPacket,
+		},
+		{
+			name:         "IGMPv1 Membership Report",
+			headerType:   igmpv1MembershipReport,
+			maxRespTime:  0,
+			groupAddress: header.IPv4AllSystems,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.V1MembershipReport
+			},
+			rxMGPkt: createAndInjectIGMPPacket,
+		},
+		{
+			name:         "IGMPv2 Membership Report",
+			headerType:   igmpv2MembershipReport,
+			maxRespTime:  0,
+			groupAddress: header.IPv4AllSystems,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.V2MembershipReport
+			},
+			rxMGPkt: createAndInjectIGMPPacket,
+		},
+		{
+			name:         "IGMP Leave Group",
+			headerType:   igmpLeaveGroup,
+			maxRespTime:  0,
+			groupAddress: header.IPv4AllRoutersGroup,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.LeaveGroup
+			},
+			rxMGPkt: createAndInjectIGMPPacket,
+		},
+		{
+			name:         "MLD Query",
+			headerType:   mldQuery,
+			maxRespTime:  0,
+			groupAddress: header.IPv6Any,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery
+			},
+			rxMGPkt: createAndInjectMLDPacket,
+		},
+		{
+			name:         "MLD Report",
+			headerType:   mldReport,
+			maxRespTime:  0,
+			groupAddress: header.IPv6Any,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerReport
+			},
+			rxMGPkt: createAndInjectMLDPacket,
+		},
+		{
+			name:         "MLD Done",
+			headerType:   mldDone,
+			maxRespTime:  0,
+			groupAddress: header.IPv6Any,
+			statCounter: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerDone
+			},
+			rxMGPkt: createAndInjectMLDPacket,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e, s, _ := createStack(t, true)
+
+			test.rxMGPkt(e, test.headerType, test.maxRespTime, test.groupAddress)
+			if got := test.statCounter(s).Value(); got != 1 {
+				t.Fatalf("got %s received = %d, want = 1", test.name, got)
+			}
+		})
+	}
+}
+
+// TestMGPJoinGroup tests that when explicitly joining a multicast group, the
+// stack schedules and sends correct Membership Reports.
+func TestMGPJoinGroup(t *testing.T) {
+	tests := []struct {
+		name                        string
+		protoNum                    tcpip.NetworkProtocolNumber
+		multicastAddr               tcpip.Address
+		maxUnsolicitedResponseDelay time.Duration
+		sentReportStat              func(*stack.Stack) *tcpip.StatCounter
+		receivedQueryStat           func(*stack.Stack) *tcpip.StatCounter
+		validateReport              func(*testing.T, channel.PacketInfo)
+	}{
+		{
+			name:                        "IGMP",
+			protoNum:                    ipv4.ProtocolNumber,
+			multicastAddr:               ipv4MulticastAddr1,
+			maxUnsolicitedResponseDelay: ipv4.UnsolicitedReportIntervalMax,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.V2MembershipReport
+			},
+			receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.MembershipQuery
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1)
+			},
+		},
+		{
+			name:                        "MLD",
+			protoNum:                    ipv6.ProtocolNumber,
+			multicastAddr:               ipv6MulticastAddr1,
+			maxUnsolicitedResponseDelay: ipv6.UnsolicitedReportIntervalMax,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport
+			},
+			receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1)
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e, s, clock := createStack(t, true)
+
+			// Test joining a specific address explicitly and verify a Report is sent
+			// immediately.
+			if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+				t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err)
+			}
+			sentReportStat := test.sentReportStat(s)
+			if got := sentReportStat.Value(); got != 1 {
+				t.Errorf("got sentReportState.Value() = %d, want = 1", got)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a report message to be sent")
+			} else {
+				test.validateReport(t, p)
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Verify the second report is sent by the maximum unsolicited response
+			// interval.
+			p, ok := e.Read()
+			if ok {
+				t.Fatalf("sent unexpected packet, expected report only after advancing the clock = %#v", p.Pkt)
+			}
+			clock.Advance(test.maxUnsolicitedResponseDelay)
+			if got := sentReportStat.Value(); got != 2 {
+				t.Errorf("got sentReportState.Value() = %d, want = 2", got)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a report message to be sent")
+			} else {
+				test.validateReport(t, p)
+			}
+
+			// Should not send any more packets.
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet = %#v", p)
+			}
+		})
+	}
+}
+
+// TestMGPLeaveGroup tests that when leaving a previously joined multicast
+// group the stack sends a leave/done message.
+func TestMGPLeaveGroup(t *testing.T) {
+	tests := []struct {
+		name           string
+		protoNum       tcpip.NetworkProtocolNumber
+		multicastAddr  tcpip.Address
+		sentReportStat func(*stack.Stack) *tcpip.StatCounter
+		sentLeaveStat  func(*stack.Stack) *tcpip.StatCounter
+		validateReport func(*testing.T, channel.PacketInfo)
+		validateLeave  func(*testing.T, channel.PacketInfo)
+	}{
+		{
+			name:          "IGMP",
+			protoNum:      ipv4.ProtocolNumber,
+			multicastAddr: ipv4MulticastAddr1,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.V2MembershipReport
+			},
+			sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.LeaveGroup
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1)
+			},
+			validateLeave: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, header.IPv4AllRoutersGroup, igmpLeaveGroup, 0, ipv4MulticastAddr1)
+			},
+		},
+		{
+			name:          "MLD",
+			protoNum:      ipv6.ProtocolNumber,
+			multicastAddr: ipv6MulticastAddr1,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport
+			},
+			sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1)
+			},
+			validateLeave: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateMLDPacket(t, p, header.IPv6AllRoutersMulticastAddress, mldDone, 0, ipv6MulticastAddr1)
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e, s, clock := createStack(t, true)
+
+			if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+				t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err)
+			}
+			if got := test.sentReportStat(s).Value(); got != 1 {
+				t.Errorf("got sentReportStat(_).Value() = %d, want = 1", got)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a report message to be sent")
+			} else {
+				test.validateReport(t, p)
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Leaving the group should trigger an leave/done message to be sent.
+			if err := s.LeaveGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+				t.Fatalf("LeaveGroup(%d, nic, %s): %s", test.protoNum, test.multicastAddr, err)
+			}
+			if got := test.sentLeaveStat(s).Value(); got != 1 {
+				t.Fatalf("got sentLeaveStat(_).Value() = %d, want = 1", got)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a leave message to be sent")
+			} else {
+				test.validateLeave(t, p)
+			}
+
+			// Should not send any more packets.
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet = %#v", p)
+			}
+		})
+	}
+}
+
+// TestMGPQueryMessages tests that a report is sent in response to query
+// messages.
+func TestMGPQueryMessages(t *testing.T) {
+	tests := []struct {
+		name                        string
+		protoNum                    tcpip.NetworkProtocolNumber
+		multicastAddr               tcpip.Address
+		maxUnsolicitedResponseDelay time.Duration
+		sentReportStat              func(*stack.Stack) *tcpip.StatCounter
+		receivedQueryStat           func(*stack.Stack) *tcpip.StatCounter
+		rxQuery                     func(*channel.Endpoint, uint8, tcpip.Address)
+		validateReport              func(*testing.T, channel.PacketInfo)
+		maxRespTimeToDuration       func(uint8) time.Duration
+	}{
+		{
+			name:                        "IGMP",
+			protoNum:                    ipv4.ProtocolNumber,
+			multicastAddr:               ipv4MulticastAddr1,
+			maxUnsolicitedResponseDelay: ipv4.UnsolicitedReportIntervalMax,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.V2MembershipReport
+			},
+			receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsReceived.MembershipQuery
+			},
+			rxQuery: func(e *channel.Endpoint, maxRespTime uint8, groupAddress tcpip.Address) {
+				createAndInjectIGMPPacket(e, igmpMembershipQuery, maxRespTime, groupAddress)
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1)
+			},
+			maxRespTimeToDuration: header.DecisecondToDuration,
+		},
+		{
+			name:                        "MLD",
+			protoNum:                    ipv6.ProtocolNumber,
+			multicastAddr:               ipv6MulticastAddr1,
+			maxUnsolicitedResponseDelay: ipv6.UnsolicitedReportIntervalMax,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport
+			},
+			receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery
+			},
+			rxQuery: func(e *channel.Endpoint, maxRespTime uint8, groupAddress tcpip.Address) {
+				createAndInjectMLDPacket(e, mldQuery, maxRespTime, groupAddress)
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1)
+			},
+			maxRespTimeToDuration: func(d uint8) time.Duration {
+				return time.Duration(d) * time.Millisecond
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			subTests := []struct {
+				name          string
+				multicastAddr tcpip.Address
+				expectReport  bool
+			}{
+				{
+					name:          "Unspecified",
+					multicastAddr: tcpip.Address(strings.Repeat("\x00", len(test.multicastAddr))),
+					expectReport:  true,
+				},
+				{
+					name:          "Specified",
+					multicastAddr: test.multicastAddr,
+					expectReport:  true,
+				},
+				{
+					name: "Specified other address",
+					multicastAddr: func() tcpip.Address {
+						addrBytes := []byte(test.multicastAddr)
+						addrBytes[len(addrBytes)-1]++
+						return tcpip.Address(addrBytes)
+					}(),
+					expectReport: false,
+				},
+			}
+
+			for _, subTest := range subTests {
+				t.Run(subTest.name, func(t *testing.T) {
+					e, s, clock := createStack(t, true)
+
+					if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+						t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err)
+					}
+					sentReportStat := test.sentReportStat(s)
+					for i := uint64(1); i <= 2; i++ {
+						sentReportStat := test.sentReportStat(s)
+						if got := sentReportStat.Value(); got != i {
+							t.Errorf("(i=%d) got sentReportState.Value() = %d, want = %d", i, got, i)
+						}
+						if p, ok := e.Read(); !ok {
+							t.Fatalf("expected %d-th report message to be sent", i)
+						} else {
+							test.validateReport(t, p)
+						}
+						clock.Advance(test.maxUnsolicitedResponseDelay)
+					}
+					if t.Failed() {
+						t.FailNow()
+					}
+
+					// Should not send any more packets until a query.
+					clock.Advance(time.Hour)
+					if p, ok := e.Read(); ok {
+						t.Fatalf("sent unexpected packet = %#v", p)
+					}
+
+					// Receive a query message which should trigger a report to be sent at
+					// some time before the maximum response time if the report is
+					// targeted at the host.
+					const maxRespTime = 100
+					test.rxQuery(e, maxRespTime, subTest.multicastAddr)
+					if p, ok := e.Read(); ok {
+						t.Fatalf("sent unexpected packet = %#v", p.Pkt)
+					}
+
+					if subTest.expectReport {
+						clock.Advance(test.maxRespTimeToDuration(maxRespTime))
+						if got := sentReportStat.Value(); got != 3 {
+							t.Errorf("got sentReportState.Value() = %d, want = 3", got)
+						}
+						if p, ok := e.Read(); !ok {
+							t.Fatal("expected a report message to be sent")
+						} else {
+							test.validateReport(t, p)
+						}
+					}
+
+					// Should not send any more packets.
+					clock.Advance(time.Hour)
+					if p, ok := e.Read(); ok {
+						t.Fatalf("sent unexpected packet = %#v", p)
+					}
+				})
+			}
+		})
+	}
+}
+
+// TestMGPQueryMessages tests that no further reports or leave/done messages
+// are sent after receiving a report.
+func TestMGPReportMessages(t *testing.T) {
+	tests := []struct {
+		name                  string
+		protoNum              tcpip.NetworkProtocolNumber
+		multicastAddr         tcpip.Address
+		sentReportStat        func(*stack.Stack) *tcpip.StatCounter
+		sentLeaveStat         func(*stack.Stack) *tcpip.StatCounter
+		rxReport              func(*channel.Endpoint)
+		validateReport        func(*testing.T, channel.PacketInfo)
+		maxRespTimeToDuration func(uint8) time.Duration
+	}{
+		{
+			name:          "IGMP",
+			protoNum:      ipv4.ProtocolNumber,
+			multicastAddr: ipv4MulticastAddr1,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.V2MembershipReport
+			},
+			sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.LeaveGroup
+			},
+			rxReport: func(e *channel.Endpoint) {
+				createAndInjectIGMPPacket(e, igmpv2MembershipReport, 0, ipv4MulticastAddr1)
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1)
+			},
+			maxRespTimeToDuration: header.DecisecondToDuration,
+		},
+		{
+			name:          "MLD",
+			protoNum:      ipv6.ProtocolNumber,
+			multicastAddr: ipv6MulticastAddr1,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport
+			},
+			sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone
+			},
+			rxReport: func(e *channel.Endpoint) {
+				createAndInjectMLDPacket(e, mldReport, 0, ipv6MulticastAddr1)
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo) {
+				t.Helper()
+
+				validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1)
+			},
+			maxRespTimeToDuration: func(d uint8) time.Duration {
+				return time.Duration(d) * time.Millisecond
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e, s, clock := createStack(t, true)
+
+			if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+				t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err)
+			}
+			sentReportStat := test.sentReportStat(s)
+			if got := sentReportStat.Value(); got != 1 {
+				t.Errorf("got sentReportStat.Value() = %d, want = 1", got)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a report message to be sent")
+			} else {
+				test.validateReport(t, p)
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Receiving a report for a group we joined should cancel any further
+			// reports.
+			test.rxReport(e)
+			clock.Advance(time.Hour)
+			if got := sentReportStat.Value(); got != 1 {
+				t.Errorf("got sentReportStat.Value() = %d, want = 1", got)
+			}
+			if p, ok := e.Read(); ok {
+				t.Errorf("sent unexpected packet = %#v", p)
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Leaving a group after getting a report should not send a leave/done
+			// message.
+			if err := s.LeaveGroup(test.protoNum, nicID, test.multicastAddr); err != nil {
+				t.Fatalf("LeaveGroup(%d, nic, %s): %s", test.protoNum, test.multicastAddr, err)
+			}
+			clock.Advance(time.Hour)
+			if got := test.sentLeaveStat(s).Value(); got != 0 {
+				t.Fatalf("got sentLeaveStat(_).Value() = %d, want = 0", got)
+			}
+
+			// Should not send any more packets.
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet = %#v", p)
+			}
+		})
+	}
+}
+
+func TestMGPWithNICLifecycle(t *testing.T) {
+	tests := []struct {
+		name                        string
+		protoNum                    tcpip.NetworkProtocolNumber
+		multicastAddrs              []tcpip.Address
+		finalMulticastAddr          tcpip.Address
+		maxUnsolicitedResponseDelay time.Duration
+		sentReportStat              func(*stack.Stack) *tcpip.StatCounter
+		sentLeaveStat               func(*stack.Stack) *tcpip.StatCounter
+		validateReport              func(*testing.T, channel.PacketInfo, tcpip.Address)
+		validateLeave               func(*testing.T, channel.PacketInfo, tcpip.Address)
+		getAndCheckGroupAddress     func(*testing.T, map[tcpip.Address]bool, channel.PacketInfo) tcpip.Address
+	}{
+		{
+			name:                        "IGMP",
+			protoNum:                    ipv4.ProtocolNumber,
+			multicastAddrs:              []tcpip.Address{ipv4MulticastAddr1, ipv4MulticastAddr2},
+			finalMulticastAddr:          ipv4MulticastAddr3,
+			maxUnsolicitedResponseDelay: ipv4.UnsolicitedReportIntervalMax,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.V2MembershipReport
+			},
+			sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().IGMP.PacketsSent.LeaveGroup
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, addr, igmpv2MembershipReport, 0, addr)
+			},
+			validateLeave: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) {
+				t.Helper()
+
+				validateIGMPPacket(t, p, header.IPv4AllRoutersGroup, igmpLeaveGroup, 0, addr)
+			},
+			getAndCheckGroupAddress: func(t *testing.T, seen map[tcpip.Address]bool, p channel.PacketInfo) tcpip.Address {
+				t.Helper()
+
+				ipv4 := header.IPv4(stack.PayloadSince(p.Pkt.NetworkHeader()))
+				if got := tcpip.TransportProtocolNumber(ipv4.Protocol()); got != header.IGMPProtocolNumber {
+					t.Fatalf("got ipv4.Protocol() = %d, want = %d", got, header.IGMPProtocolNumber)
+				}
+				addr := header.IGMP(ipv4.Payload()).GroupAddress()
+				s, ok := seen[addr]
+				if !ok {
+					t.Fatalf("unexpectedly got a packet for group %s", addr)
+				}
+				if s {
+					t.Fatalf("already saw packet for group %s", addr)
+				}
+				seen[addr] = true
+				return addr
+			},
+		},
+		{
+			name:                        "MLD",
+			protoNum:                    ipv6.ProtocolNumber,
+			multicastAddrs:              []tcpip.Address{ipv6MulticastAddr1, ipv6MulticastAddr2},
+			finalMulticastAddr:          ipv6MulticastAddr3,
+			maxUnsolicitedResponseDelay: ipv6.UnsolicitedReportIntervalMax,
+			sentReportStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport
+			},
+			sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter {
+				return s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone
+			},
+			validateReport: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) {
+				t.Helper()
+
+				validateMLDPacket(t, p, addr, mldReport, 0, addr)
+			},
+			validateLeave: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) {
+				t.Helper()
+
+				validateMLDPacket(t, p, header.IPv6AllRoutersMulticastAddress, mldDone, 0, addr)
+			},
+			getAndCheckGroupAddress: func(t *testing.T, seen map[tcpip.Address]bool, p channel.PacketInfo) tcpip.Address {
+				t.Helper()
+
+				ipv6 := header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader()))
+				if got := tcpip.TransportProtocolNumber(ipv6.NextHeader()); got != header.ICMPv6ProtocolNumber {
+					t.Fatalf("got ipv6.NextHeader() = %d, want = %d", got, header.ICMPv6ProtocolNumber)
+				}
+				icmpv6 := header.ICMPv6(ipv6.Payload())
+				if got := icmpv6.Type(); got != header.ICMPv6MulticastListenerReport && got != header.ICMPv6MulticastListenerDone {
+					t.Fatalf("got icmpv6.Type() = %d, want = %d or %d", got, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerDone)
+				}
+				addr := header.MLD(icmpv6.MessageBody()).MulticastAddress()
+				s, ok := seen[addr]
+				if !ok {
+					t.Fatalf("unexpectedly got a packet for group %s", addr)
+				}
+				if s {
+					t.Fatalf("already saw packet for group %s", addr)
+				}
+				seen[addr] = true
+				return addr
+
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e, s, clock := createStack(t, true)
+
+			sentReportStat := test.sentReportStat(s)
+			var reportCounter uint64
+			for _, a := range test.multicastAddrs {
+				if err := s.JoinGroup(test.protoNum, nicID, a); err != nil {
+					t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, a, err)
+				}
+				reportCounter++
+				if got := sentReportStat.Value(); got != reportCounter {
+					t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter)
+				}
+				if p, ok := e.Read(); !ok {
+					t.Fatalf("expected a report message to be sent for %s", a)
+				} else {
+					test.validateReport(t, p, a)
+				}
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Leave messages should be sent for the joined groups when the NIC is
+			// disabled.
+			if err := s.DisableNIC(nicID); err != nil {
+				t.Fatalf("DisableNIC(%d): %s", nicID, err)
+			}
+			sentLeaveStat := test.sentLeaveStat(s)
+			leaveCounter := uint64(len(test.multicastAddrs))
+			if got := sentLeaveStat.Value(); got != leaveCounter {
+				t.Errorf("got sentLeaveStat.Value() = %d, want = %d", got, leaveCounter)
+			}
+			{
+				seen := make(map[tcpip.Address]bool)
+				for _, a := range test.multicastAddrs {
+					seen[a] = false
+				}
+
+				for i, _ := range test.multicastAddrs {
+					p, ok := e.Read()
+					if !ok {
+						t.Fatalf("expected (%d-th) leave message to be sent", i)
+					}
+
+					test.validateLeave(t, p, test.getAndCheckGroupAddress(t, seen, p))
+				}
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Reports should be sent for the joined groups when the NIC is enabled.
+			if err := s.EnableNIC(nicID); err != nil {
+				t.Fatalf("EnableNIC(%d): %s", nicID, err)
+			}
+			reportCounter += uint64(len(test.multicastAddrs))
+			if got := sentReportStat.Value(); got != reportCounter {
+				t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter)
+			}
+			{
+				seen := make(map[tcpip.Address]bool)
+				for _, a := range test.multicastAddrs {
+					seen[a] = false
+				}
+
+				for i, _ := range test.multicastAddrs {
+					p, ok := e.Read()
+					if !ok {
+						t.Fatalf("expected (%d-th) report message to be sent", i)
+					}
+
+					test.validateReport(t, p, test.getAndCheckGroupAddress(t, seen, p))
+				}
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Joining/leaving a group while disabled should not send any messages.
+			if err := s.DisableNIC(nicID); err != nil {
+				t.Fatalf("DisableNIC(%d): %s", nicID, err)
+			}
+			leaveCounter += uint64(len(test.multicastAddrs))
+			if got := sentLeaveStat.Value(); got != leaveCounter {
+				t.Errorf("got sentLeaveStat.Value() = %d, want = %d", got, leaveCounter)
+			}
+			for i, _ := range test.multicastAddrs {
+				if _, ok := e.Read(); !ok {
+					t.Fatalf("expected (%d-th) leave message to be sent", i)
+				}
+			}
+			for _, a := range test.multicastAddrs {
+				if err := s.LeaveGroup(test.protoNum, nicID, a); err != nil {
+					t.Fatalf("LeaveGroup(%d, nic, %s): %s", test.protoNum, a, err)
+				}
+				if got := sentLeaveStat.Value(); got != leaveCounter {
+					t.Errorf("got sentLeaveStat.Value() = %d, want = %d", got, leaveCounter)
+				}
+				if p, ok := e.Read(); ok {
+					t.Fatalf("leaving group %s on disabled NIC sent unexpected packet = %#v", a, p.Pkt)
+				}
+			}
+			if err := s.JoinGroup(test.protoNum, nicID, test.finalMulticastAddr); err != nil {
+				t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.finalMulticastAddr, err)
+			}
+			if got := sentReportStat.Value(); got != reportCounter {
+				t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter)
+			}
+			if p, ok := e.Read(); ok {
+				t.Fatalf("joining group %s on disabled NIC sent unexpected packet = %#v", test.finalMulticastAddr, p.Pkt)
+			}
+
+			// A report should only be sent for the group we last joined after
+			// enabling the NIC since the original groups were all left.
+			if err := s.EnableNIC(nicID); err != nil {
+				t.Fatalf("EnableNIC(%d): %s", nicID, err)
+			}
+			reportCounter++
+			if got := sentReportStat.Value(); got != reportCounter {
+				t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a report message to be sent")
+			} else {
+				test.validateReport(t, p, test.finalMulticastAddr)
+			}
+
+			clock.Advance(test.maxUnsolicitedResponseDelay)
+			reportCounter++
+			if got := sentReportStat.Value(); got != reportCounter {
+				t.Errorf("got sentReportState.Value() = %d, want = %d", got, reportCounter)
+			}
+			if p, ok := e.Read(); !ok {
+				t.Fatal("expected a report message to be sent")
+			} else {
+				test.validateReport(t, p, test.finalMulticastAddr)
+			}
+
+			// Should not send any more packets.
+			clock.Advance(time.Hour)
+			if p, ok := e.Read(); ok {
+				t.Fatalf("sent unexpected packet = %#v", p)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/testutil/testutil.go b/pkg/tcpip/network/testutil/testutil.go
index 7cc52985e..5c3363759 100644
--- a/pkg/tcpip/network/testutil/testutil.go
+++ b/pkg/tcpip/network/testutil/testutil.go
@@ -85,21 +85,6 @@ func (ep *MockLinkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts st
 	return n, nil
 }
 
-// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
-func (ep *MockLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	if ep.allowPackets == 0 {
-		return ep.err
-	}
-	ep.allowPackets--
-
-	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: vv,
-	})
-	ep.WrittenPackets = append(ep.WrittenPackets, pkt)
-
-	return nil
-}
-
 // Attach implements LinkEndpoint.Attach.
 func (*MockLinkEndpoint) Attach(stack.NetworkDispatcher) {}
 
diff --git a/pkg/tcpip/socketops.go b/pkg/tcpip/socketops.go
index 2a6c7c7c0..c53698a6a 100644
--- a/pkg/tcpip/socketops.go
+++ b/pkg/tcpip/socketops.go
@@ -15,31 +15,290 @@
 package tcpip
 
 import (
-	"gvisor.dev/gvisor/pkg/sync"
+	"sync/atomic"
 )
 
-// SocketOptions contains all the variables which store values for socket
-// level options.
+// SocketOptionsHandler holds methods that help define endpoint specific
+// behavior for socket level socket options. These must be implemented by
+// endpoints to get notified when socket level options are set.
+type SocketOptionsHandler interface {
+	// OnReuseAddressSet is invoked when SO_REUSEADDR is set for an endpoint.
+	OnReuseAddressSet(v bool)
+
+	// OnReusePortSet is invoked when SO_REUSEPORT is set for an endpoint.
+	OnReusePortSet(v bool)
+
+	// OnKeepAliveSet is invoked when SO_KEEPALIVE is set for an endpoint.
+	OnKeepAliveSet(v bool)
+
+	// OnDelayOptionSet is invoked when TCP_NODELAY is set for an endpoint.
+	// Note that v will be the inverse of TCP_NODELAY option.
+	OnDelayOptionSet(v bool)
+
+	// OnCorkOptionSet is invoked when TCP_CORK is set for an endpoint.
+	OnCorkOptionSet(v bool)
+}
+
+// DefaultSocketOptionsHandler is an embeddable type that implements no-op
+// implementations for SocketOptionsHandler methods.
+type DefaultSocketOptionsHandler struct{}
+
+var _ SocketOptionsHandler = (*DefaultSocketOptionsHandler)(nil)
+
+// OnReuseAddressSet implements SocketOptionsHandler.OnReuseAddressSet.
+func (*DefaultSocketOptionsHandler) OnReuseAddressSet(bool) {}
+
+// OnReusePortSet implements SocketOptionsHandler.OnReusePortSet.
+func (*DefaultSocketOptionsHandler) OnReusePortSet(bool) {}
+
+// OnKeepAliveSet implements SocketOptionsHandler.OnKeepAliveSet.
+func (*DefaultSocketOptionsHandler) OnKeepAliveSet(bool) {}
+
+// OnDelayOptionSet implements SocketOptionsHandler.OnDelayOptionSet.
+func (*DefaultSocketOptionsHandler) OnDelayOptionSet(bool) {}
+
+// OnCorkOptionSet implements SocketOptionsHandler.OnCorkOptionSet.
+func (*DefaultSocketOptionsHandler) OnCorkOptionSet(bool) {}
+
+// SocketOptions contains all the variables which store values for SOL_SOCKET,
+// SOL_IP, SOL_IPV6 and SOL_TCP level options.
 //
 // +stateify savable
 type SocketOptions struct {
-	// mu protects fields below.
-	mu               sync.Mutex `state:"nosave"`
-	broadcastEnabled bool
+	handler SocketOptionsHandler
+
+	// These fields are accessed and modified using atomic operations.
+
+	// broadcastEnabled determines whether datagram sockets are allowed to send
+	// packets to a broadcast address.
+	broadcastEnabled uint32
+
+	// passCredEnabled determines whether SCM_CREDENTIALS socket control messages
+	// are enabled.
+	passCredEnabled uint32
+
+	// noChecksumEnabled determines whether UDP checksum is disabled while
+	// transmitting for this socket.
+	noChecksumEnabled uint32
+
+	// reuseAddressEnabled determines whether Bind() should allow reuse of local
+	// address.
+	reuseAddressEnabled uint32
+
+	// reusePortEnabled determines whether to permit multiple sockets to be bound
+	// to an identical socket address.
+	reusePortEnabled uint32
+
+	// keepAliveEnabled determines whether TCP keepalive is enabled for this
+	// socket.
+	keepAliveEnabled uint32
+
+	// multicastLoopEnabled determines whether multicast packets sent over a
+	// non-loopback interface will be looped back. Analogous to inet->mc_loop.
+	multicastLoopEnabled uint32
+
+	// receiveTOSEnabled is used to specify if the TOS ancillary message is
+	// passed with incoming packets.
+	receiveTOSEnabled uint32
+
+	// receiveTClassEnabled is used to specify if the IPV6_TCLASS ancillary
+	// message is passed with incoming packets.
+	receiveTClassEnabled uint32
+
+	// receivePacketInfoEnabled is used to specify if more inforamtion is
+	// provided with incoming packets such as interface index and address.
+	receivePacketInfoEnabled uint32
+
+	// hdrIncludeEnabled is used to indicate for a raw endpoint that all packets
+	// being written have an IP header and the endpoint should not attach an IP
+	// header.
+	hdrIncludedEnabled uint32
+
+	// v6OnlyEnabled is used to determine whether an IPv6 socket is to be
+	// restricted to sending and receiving IPv6 packets only.
+	v6OnlyEnabled uint32
+
+	// quickAckEnabled is used to represent the value of TCP_QUICKACK option.
+	// It currently does not have any effect on the TCP endpoint.
+	quickAckEnabled uint32
+
+	// delayOptionEnabled is used to specify if data should be sent out immediately
+	// by the transport protocol. For TCP, it determines if the Nagle algorithm
+	// is on or off.
+	delayOptionEnabled uint32
+
+	// corkOptionEnabled is used to specify if data should be held until segments
+	// are full by the TCP transport protocol.
+	corkOptionEnabled uint32
+}
+
+// InitHandler initializes the handler. This must be called before using the
+// socket options utility.
+func (so *SocketOptions) InitHandler(handler SocketOptionsHandler) {
+	so.handler = handler
+}
+
+func storeAtomicBool(addr *uint32, v bool) {
+	var val uint32
+	if v {
+		val = 1
+	}
+	atomic.StoreUint32(addr, val)
 }
 
 // GetBroadcast gets value for SO_BROADCAST option.
 func (so *SocketOptions) GetBroadcast() bool {
-	so.mu.Lock()
-	defer so.mu.Unlock()
-
-	return so.broadcastEnabled
+	return atomic.LoadUint32(&so.broadcastEnabled) != 0
 }
 
 // SetBroadcast sets value for SO_BROADCAST option.
 func (so *SocketOptions) SetBroadcast(v bool) {
-	so.mu.Lock()
-	defer so.mu.Unlock()
+	storeAtomicBool(&so.broadcastEnabled, v)
+}
+
+// GetPassCred gets value for SO_PASSCRED option.
+func (so *SocketOptions) GetPassCred() bool {
+	return atomic.LoadUint32(&so.passCredEnabled) != 0
+}
+
+// SetPassCred sets value for SO_PASSCRED option.
+func (so *SocketOptions) SetPassCred(v bool) {
+	storeAtomicBool(&so.passCredEnabled, v)
+}
+
+// GetNoChecksum gets value for SO_NO_CHECK option.
+func (so *SocketOptions) GetNoChecksum() bool {
+	return atomic.LoadUint32(&so.noChecksumEnabled) != 0
+}
+
+// SetNoChecksum sets value for SO_NO_CHECK option.
+func (so *SocketOptions) SetNoChecksum(v bool) {
+	storeAtomicBool(&so.noChecksumEnabled, v)
+}
+
+// GetReuseAddress gets value for SO_REUSEADDR option.
+func (so *SocketOptions) GetReuseAddress() bool {
+	return atomic.LoadUint32(&so.reuseAddressEnabled) != 0
+}
+
+// SetReuseAddress sets value for SO_REUSEADDR option.
+func (so *SocketOptions) SetReuseAddress(v bool) {
+	storeAtomicBool(&so.reuseAddressEnabled, v)
+	so.handler.OnReuseAddressSet(v)
+}
+
+// GetReusePort gets value for SO_REUSEPORT option.
+func (so *SocketOptions) GetReusePort() bool {
+	return atomic.LoadUint32(&so.reusePortEnabled) != 0
+}
+
+// SetReusePort sets value for SO_REUSEPORT option.
+func (so *SocketOptions) SetReusePort(v bool) {
+	storeAtomicBool(&so.reusePortEnabled, v)
+	so.handler.OnReusePortSet(v)
+}
+
+// GetKeepAlive gets value for SO_KEEPALIVE option.
+func (so *SocketOptions) GetKeepAlive() bool {
+	return atomic.LoadUint32(&so.keepAliveEnabled) != 0
+}
+
+// SetKeepAlive sets value for SO_KEEPALIVE option.
+func (so *SocketOptions) SetKeepAlive(v bool) {
+	storeAtomicBool(&so.keepAliveEnabled, v)
+	so.handler.OnKeepAliveSet(v)
+}
+
+// GetMulticastLoop gets value for IP_MULTICAST_LOOP option.
+func (so *SocketOptions) GetMulticastLoop() bool {
+	return atomic.LoadUint32(&so.multicastLoopEnabled) != 0
+}
+
+// SetMulticastLoop sets value for IP_MULTICAST_LOOP option.
+func (so *SocketOptions) SetMulticastLoop(v bool) {
+	storeAtomicBool(&so.multicastLoopEnabled, v)
+}
+
+// GetReceiveTOS gets value for IP_RECVTOS option.
+func (so *SocketOptions) GetReceiveTOS() bool {
+	return atomic.LoadUint32(&so.receiveTOSEnabled) != 0
+}
+
+// SetReceiveTOS sets value for IP_RECVTOS option.
+func (so *SocketOptions) SetReceiveTOS(v bool) {
+	storeAtomicBool(&so.receiveTOSEnabled, v)
+}
+
+// GetReceiveTClass gets value for IPV6_RECVTCLASS option.
+func (so *SocketOptions) GetReceiveTClass() bool {
+	return atomic.LoadUint32(&so.receiveTClassEnabled) != 0
+}
+
+// SetReceiveTClass sets value for IPV6_RECVTCLASS option.
+func (so *SocketOptions) SetReceiveTClass(v bool) {
+	storeAtomicBool(&so.receiveTClassEnabled, v)
+}
+
+// GetReceivePacketInfo gets value for IP_PKTINFO option.
+func (so *SocketOptions) GetReceivePacketInfo() bool {
+	return atomic.LoadUint32(&so.receivePacketInfoEnabled) != 0
+}
+
+// SetReceivePacketInfo sets value for IP_PKTINFO option.
+func (so *SocketOptions) SetReceivePacketInfo(v bool) {
+	storeAtomicBool(&so.receivePacketInfoEnabled, v)
+}
+
+// GetHeaderIncluded gets value for IP_HDRINCL option.
+func (so *SocketOptions) GetHeaderIncluded() bool {
+	return atomic.LoadUint32(&so.hdrIncludedEnabled) != 0
+}
+
+// SetHeaderIncluded sets value for IP_HDRINCL option.
+func (so *SocketOptions) SetHeaderIncluded(v bool) {
+	storeAtomicBool(&so.hdrIncludedEnabled, v)
+}
+
+// GetV6Only gets value for IPV6_V6ONLY option.
+func (so *SocketOptions) GetV6Only() bool {
+	return atomic.LoadUint32(&so.v6OnlyEnabled) != 0
+}
+
+// SetV6Only sets value for IPV6_V6ONLY option.
+//
+// Preconditions: the backing TCP or UDP endpoint must be in initial state.
+func (so *SocketOptions) SetV6Only(v bool) {
+	storeAtomicBool(&so.v6OnlyEnabled, v)
+}
+
+// GetQuickAck gets value for TCP_QUICKACK option.
+func (so *SocketOptions) GetQuickAck() bool {
+	return atomic.LoadUint32(&so.quickAckEnabled) != 0
+}
+
+// SetQuickAck sets value for TCP_QUICKACK option.
+func (so *SocketOptions) SetQuickAck(v bool) {
+	storeAtomicBool(&so.quickAckEnabled, v)
+}
+
+// GetDelayOption gets inverted value for TCP_NODELAY option.
+func (so *SocketOptions) GetDelayOption() bool {
+	return atomic.LoadUint32(&so.delayOptionEnabled) != 0
+}
+
+// SetDelayOption sets inverted value for TCP_NODELAY option.
+func (so *SocketOptions) SetDelayOption(v bool) {
+	storeAtomicBool(&so.delayOptionEnabled, v)
+	so.handler.OnDelayOptionSet(v)
+}
+
+// GetCorkOption gets value for TCP_CORK option.
+func (so *SocketOptions) GetCorkOption() bool {
+	return atomic.LoadUint32(&so.corkOptionEnabled) != 0
+}
 
-	so.broadcastEnabled = v
+// SetCorkOption sets value for TCP_CORK option.
+func (so *SocketOptions) SetCorkOption(v bool) {
+	storeAtomicBool(&so.corkOptionEnabled, v)
+	so.handler.OnCorkOptionSet(v)
 }
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index d09ebe7fa..9cc6074da 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "most_shards")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -112,7 +112,7 @@ go_test(
         "transport_demuxer_test.go",
         "transport_test.go",
     ],
-    shard_count = 20,
+    shard_count = most_shards,
     deps = [
         ":stack",
         "//pkg/rand",
@@ -120,6 +120,7 @@ go_test(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
+        "//pkg/tcpip/faketime",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
@@ -131,7 +132,6 @@ go_test(
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
         "@com_github_google_go_cmp//cmp:go_default_library",
-        "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
     ],
 )
 
diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go
index 9478f3fb7..6e4f5fa46 100644
--- a/pkg/tcpip/stack/addressable_endpoint_state.go
+++ b/pkg/tcpip/stack/addressable_endpoint_state.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
-var _ GroupAddressableEndpoint = (*AddressableEndpointState)(nil)
 var _ AddressableEndpoint = (*AddressableEndpointState)(nil)
 
 // AddressableEndpointState is an implementation of an AddressableEndpoint.
@@ -37,10 +36,6 @@ type AddressableEndpointState struct {
 
 		endpoints map[tcpip.Address]*addressState
 		primary   []*addressState
-
-		// groups holds the mapping between group addresses and the number of times
-		// they have been joined.
-		groups map[tcpip.Address]uint32
 	}
 }
 
@@ -53,65 +48,33 @@ func (a *AddressableEndpointState) Init(networkEndpoint NetworkEndpoint) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	a.mu.endpoints = make(map[tcpip.Address]*addressState)
-	a.mu.groups = make(map[tcpip.Address]uint32)
-}
-
-// ReadOnlyAddressableEndpointState provides read-only access to an
-// AddressableEndpointState.
-type ReadOnlyAddressableEndpointState struct {
-	inner *AddressableEndpointState
 }
 
-// AddrOrMatching returns an endpoint for the passed address that is consisdered
-// bound to the wrapped AddressableEndpointState.
+// GetAddress returns the AddressEndpoint for the passed address.
 //
-// If addr is an exact match with an existing address, that address is returned.
-// Otherwise, f is called with each address and the address that f returns true
-// for is returned.
-//
-// Returns nil of no address matches.
-func (m ReadOnlyAddressableEndpointState) AddrOrMatching(addr tcpip.Address, spoofingOrPrimiscuous bool, f func(AddressEndpoint) bool) AddressEndpoint {
-	m.inner.mu.RLock()
-	defer m.inner.mu.RUnlock()
-
-	if ep, ok := m.inner.mu.endpoints[addr]; ok {
-		if ep.IsAssigned(spoofingOrPrimiscuous) && ep.IncRef() {
-			return ep
-		}
-	}
-
-	for _, ep := range m.inner.mu.endpoints {
-		if ep.IsAssigned(spoofingOrPrimiscuous) && f(ep) && ep.IncRef() {
-			return ep
-		}
-	}
-
-	return nil
-}
-
-// Lookup returns the AddressEndpoint for the passed address.
+// GetAddress does not increment the address's reference count or check if the
+// address is considered bound to the endpoint.
 //
-// Returns nil if the passed address is not associated with the
-// AddressableEndpointState.
-func (m ReadOnlyAddressableEndpointState) Lookup(addr tcpip.Address) AddressEndpoint {
-	m.inner.mu.RLock()
-	defer m.inner.mu.RUnlock()
+// Returns nil if the passed address is not associated with the endpoint.
+func (a *AddressableEndpointState) GetAddress(addr tcpip.Address) AddressEndpoint {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
 
-	ep, ok := m.inner.mu.endpoints[addr]
+	ep, ok := a.mu.endpoints[addr]
 	if !ok {
 		return nil
 	}
 	return ep
 }
 
-// ForEach calls f for each address pair.
+// ForEachEndpoint calls f for each address.
 //
-// If f returns false, f is no longer be called.
-func (m ReadOnlyAddressableEndpointState) ForEach(f func(AddressEndpoint) bool) {
-	m.inner.mu.RLock()
-	defer m.inner.mu.RUnlock()
+// Once f returns false, f will no longer be called.
+func (a *AddressableEndpointState) ForEachEndpoint(f func(AddressEndpoint) bool) {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
 
-	for _, ep := range m.inner.mu.endpoints {
+	for _, ep := range a.mu.endpoints {
 		if !f(ep) {
 			return
 		}
@@ -119,21 +82,15 @@ func (m ReadOnlyAddressableEndpointState) ForEach(f func(AddressEndpoint) bool)
 }
 
 // ForEachPrimaryEndpoint calls f for each primary address.
-//
-// If f returns false, f is no longer be called.
-func (m ReadOnlyAddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint)) {
-	m.inner.mu.RLock()
-	defer m.inner.mu.RUnlock()
-	for _, ep := range m.inner.mu.primary {
+func (a *AddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint)) {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+
+	for _, ep := range a.mu.primary {
 		f(ep)
 	}
 }
 
-// ReadOnly returns a readonly reference to a.
-func (a *AddressableEndpointState) ReadOnly() ReadOnlyAddressableEndpointState {
-	return ReadOnlyAddressableEndpointState{inner: a}
-}
-
 func (a *AddressableEndpointState) releaseAddressState(addrState *addressState) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
@@ -335,11 +292,6 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address
 func (a *AddressableEndpointState) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
 	a.mu.Lock()
 	defer a.mu.Unlock()
-
-	if _, ok := a.mu.groups[addr]; ok {
-		panic(fmt.Sprintf("group address = %s must be removed with LeaveGroup", addr))
-	}
-
 	return a.removePermanentAddressLocked(addr)
 }
 
@@ -471,8 +423,19 @@ func (a *AddressableEndpointState) acquirePrimaryAddressRLocked(isValid func(*ad
 	return deprecatedEndpoint
 }
 
-// AcquireAssignedAddress implements AddressableEndpoint.
-func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint {
+// AcquireAssignedAddressOrMatching returns an address endpoint that is
+// considered assigned to the addressable endpoint.
+//
+// If the address is an exact match with an existing address, that address is
+// returned. Otherwise, if f is provided, f is called with each address and
+// the address that f returns true for is returned.
+//
+// If there is no matching address, a temporary address will be returned if
+// allowTemp is true.
+//
+// Regardless how the address was obtained, it will be acquired before it is
+// returned.
+func (a *AddressableEndpointState) AcquireAssignedAddressOrMatching(localAddr tcpip.Address, f func(AddressEndpoint) bool, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
@@ -488,6 +451,14 @@ func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Addres
 		return addrState
 	}
 
+	if f != nil {
+		for _, addrState := range a.mu.endpoints {
+			if addrState.IsAssigned(allowTemp) && f(addrState) && addrState.IncRef() {
+				return addrState
+			}
+		}
+	}
+
 	if !allowTemp {
 		return nil
 	}
@@ -520,6 +491,11 @@ func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Addres
 	return ep
 }
 
+// AcquireAssignedAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint {
+	return a.AcquireAssignedAddressOrMatching(localAddr, nil, allowTemp, tempPEB)
+}
+
 // AcquireOutgoingPrimaryAddress implements AddressableEndpoint.
 func (a *AddressableEndpointState) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint {
 	a.mu.RLock()
@@ -588,72 +564,11 @@ func (a *AddressableEndpointState) PermanentAddresses() []tcpip.AddressWithPrefi
 	return addrs
 }
 
-// JoinGroup implements GroupAddressableEndpoint.
-func (a *AddressableEndpointState) JoinGroup(group tcpip.Address) (bool, *tcpip.Error) {
-	a.mu.Lock()
-	defer a.mu.Unlock()
-
-	joins, ok := a.mu.groups[group]
-	if !ok {
-		ep, err := a.addAndAcquireAddressLocked(group.WithPrefix(), NeverPrimaryEndpoint, AddressConfigStatic, false /* deprecated */, true /* permanent */)
-		if err != nil {
-			return false, err
-		}
-		// We have no need for the address endpoint.
-		a.decAddressRefLocked(ep)
-	}
-
-	a.mu.groups[group] = joins + 1
-	return !ok, nil
-}
-
-// LeaveGroup implements GroupAddressableEndpoint.
-func (a *AddressableEndpointState) LeaveGroup(group tcpip.Address) (bool, *tcpip.Error) {
-	a.mu.Lock()
-	defer a.mu.Unlock()
-
-	joins, ok := a.mu.groups[group]
-	if !ok {
-		return false, tcpip.ErrBadLocalAddress
-	}
-
-	if joins == 1 {
-		a.removeGroupAddressLocked(group)
-		delete(a.mu.groups, group)
-		return true, nil
-	}
-
-	a.mu.groups[group] = joins - 1
-	return false, nil
-}
-
-// IsInGroup implements GroupAddressableEndpoint.
-func (a *AddressableEndpointState) IsInGroup(group tcpip.Address) bool {
-	a.mu.RLock()
-	defer a.mu.RUnlock()
-	_, ok := a.mu.groups[group]
-	return ok
-}
-
-func (a *AddressableEndpointState) removeGroupAddressLocked(group tcpip.Address) {
-	if err := a.removePermanentAddressLocked(group); err != nil {
-		// removePermanentEndpointLocked would only return an error if group is
-		// not bound to the addressable endpoint, but we know it MUST be assigned
-		// since we have group in our map of groups.
-		panic(fmt.Sprintf("error removing group address = %s: %s", group, err))
-	}
-}
-
 // Cleanup forcefully leaves all groups and removes all permanent addresses.
 func (a *AddressableEndpointState) Cleanup() {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
-	for group := range a.mu.groups {
-		a.removeGroupAddressLocked(group)
-	}
-	a.mu.groups = make(map[tcpip.Address]uint32)
-
 	for _, ep := range a.mu.endpoints {
 		// removePermanentEndpointLocked returns tcpip.ErrBadLocalAddress if ep is
 		// not a permanent address.
diff --git a/pkg/tcpip/stack/addressable_endpoint_state_test.go b/pkg/tcpip/stack/addressable_endpoint_state_test.go
index 26787d0a3..140f146f6 100644
--- a/pkg/tcpip/stack/addressable_endpoint_state_test.go
+++ b/pkg/tcpip/stack/addressable_endpoint_state_test.go
@@ -53,25 +53,9 @@ func TestAddressableEndpointStateCleanup(t *testing.T) {
 		ep.DecRef()
 	}
 
-	group := tcpip.Address("\x02")
-	if added, err := s.JoinGroup(group); err != nil {
-		t.Fatalf("s.JoinGroup(%s): %s", group, err)
-	} else if !added {
-		t.Fatalf("got s.JoinGroup(%s) = false, want = true", group)
-	}
-	if !s.IsInGroup(group) {
-		t.Fatalf("got s.IsInGroup(%s) = false, want = true", group)
-	}
-
 	s.Cleanup()
-	{
-		ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint)
-		if ep != nil {
-			ep.DecRef()
-			t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = %s, want = nil", addr.Address, ep.AddressWithPrefix())
-		}
-	}
-	if s.IsInGroup(group) {
-		t.Fatalf("got s.IsInGroup(%s) = true, want = false", group)
+	if ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint); ep != nil {
+		ep.DecRef()
+		t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = %s, want = nil", addr.Address, ep.AddressWithPrefix())
 	}
 }
diff --git a/pkg/tcpip/stack/forwarding_test.go b/pkg/tcpip/stack/forwarding_test.go
index 6dc9e7859..5ec9b3411 100644
--- a/pkg/tcpip/stack/forwarding_test.go
+++ b/pkg/tcpip/stack/forwarding_test.go
@@ -309,7 +309,7 @@ func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 
 func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
 	p := fwdTestPacketInfo{
-		RemoteLinkAddress: r.RemoteLinkAddress,
+		RemoteLinkAddress: r.RemoteLinkAddress(),
 		LocalLinkAddress:  r.LocalLinkAddress,
 		Pkt:               pkt,
 	}
@@ -333,20 +333,6 @@ func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBuffer
 	return n, nil
 }
 
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	p := fwdTestPacketInfo{
-		Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}),
-	}
-
-	select {
-	case e.C <- p:
-	default:
-	}
-
-	return nil
-}
-
 // Wait implements stack.LinkEndpoint.Wait.
 func (*fwdTestLinkEndpoint) Wait() {}
 
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 73a01c2dd..31b67b987 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
@@ -352,7 +353,7 @@ func TestDADDisabled(t *testing.T) {
 	}
 
 	// We should not have sent any NDP NS messages.
-	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
+	if got := s.Stats().ICMP.V6.PacketsSent.NeighborSolicit.Value(); got != 0 {
 		t.Fatalf("got NeighborSolicit = %d, want = 0", got)
 	}
 }
@@ -465,14 +466,18 @@ func TestDADResolve(t *testing.T) {
 				if err != tcpip.ErrNoRoute {
 					t.Errorf("got FindRoute(%d, '', %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute)
 				}
-				r.Release()
+				if r != nil {
+					r.Release()
+				}
 			}
 			{
 				r, err := s.FindRoute(nicID, addr1, addr2, header.IPv6ProtocolNumber, false)
 				if err != tcpip.ErrNoRoute {
 					t.Errorf("got FindRoute(%d, %s, %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr1, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute)
 				}
-				r.Release()
+				if r != nil {
+					r.Release()
+				}
 			}
 
 			if t.Failed() {
@@ -510,7 +515,9 @@ func TestDADResolve(t *testing.T) {
 				} else if r.LocalAddress != addr1 {
 					t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, addr1)
 				}
-				r.Release()
+				if r != nil {
+					r.Release()
+				}
 			}
 
 			if t.Failed() {
@@ -518,7 +525,7 @@ func TestDADResolve(t *testing.T) {
 			}
 
 			// Should not have sent any more NS messages.
-			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != uint64(test.dupAddrDetectTransmits) {
+			if got := s.Stats().ICMP.V6.PacketsSent.NeighborSolicit.Value(); got != uint64(test.dupAddrDetectTransmits) {
 				t.Fatalf("got NeighborSolicit = %d, want = %d", got, test.dupAddrDetectTransmits)
 			}
 
@@ -533,8 +540,8 @@ func TestDADResolve(t *testing.T) {
 
 				// Make sure the right remote link address is used.
 				snmc := header.SolicitedNodeAddr(addr1)
-				if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want {
-					t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+				if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(snmc); got != want {
+					t.Errorf("got remote link address = %s, want = %s", got, want)
 				}
 
 				// Check NDP NS packet.
@@ -563,7 +570,7 @@ func rxNDPSolicit(e *channel.Endpoint, tgt tcpip.Address) {
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize)
 	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
 	pkt.SetType(header.ICMPv6NeighborSolicit)
-	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+	ns := header.NDPNeighborSolicit(pkt.MessageBody())
 	ns.SetTargetAddress(tgt)
 	snmc := header.SolicitedNodeAddr(tgt)
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{}))
@@ -605,7 +612,7 @@ func TestDADFail(t *testing.T) {
 				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
 				pkt := header.ICMPv6(hdr.Prepend(naSize))
 				pkt.SetType(header.ICMPv6NeighborAdvert)
-				na := header.NDPNeighborAdvert(pkt.NDPPayload())
+				na := header.NDPNeighborAdvert(pkt.MessageBody())
 				na.SetSolicitedFlag(true)
 				na.SetOverrideFlag(true)
 				na.SetTargetAddress(tgt)
@@ -666,7 +673,7 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate an address conflict.
 			test.rxPkt(e, addr1)
 
-			stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
+			stat := test.getStat(s.Stats().ICMP.V6.PacketsReceived)
 			if got := stat.Value(); got != 1 {
 				t.Fatalf("got stat = %d, want = 1", got)
 			}
@@ -803,7 +810,7 @@ func TestDADStop(t *testing.T) {
 			}
 
 			// Should not have sent more than 1 NS message.
-			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
+			if got := s.Stats().ICMP.V6.PacketsSent.NeighborSolicit.Value(); got > 1 {
 				t.Errorf("got NeighborSolicit = %d, want <= 1", got)
 			}
 		})
@@ -982,7 +989,7 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
 	pkt.SetType(header.ICMPv6RouterAdvert)
 	pkt.SetCode(0)
-	raPayload := pkt.NDPPayload()
+	raPayload := pkt.MessageBody()
 	ra := header.NDPRouterAdvert(raPayload)
 	// Populate the Router Lifetime.
 	binary.BigEndian.PutUint16(raPayload[2:], rl)
@@ -2162,8 +2169,8 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) {
 						NDPConfigs: ipv6.NDPConfigurations{
 							AutoGenTempGlobalAddresses: true,
 						},
-						NDPDisp:              &ndpDisp,
-						AutoGenIPv6LinkLocal: true,
+						NDPDisp:          &ndpDisp,
+						AutoGenLinkLocal: true,
 					})},
 				})
 
@@ -2843,9 +2850,7 @@ func addrForNewConnectionTo(t *testing.T, s *stack.Stack, addr tcpip.FullAddress
 		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
 	}
 	defer ep.Close()
-	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
-	}
+	ep.SocketOptions().SetV6Only(true)
 	if err := ep.Connect(addr); err != nil {
 		t.Fatalf("ep.Connect(%+v): %s", addr, err)
 	}
@@ -2879,9 +2884,7 @@ func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullA
 		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
 	}
 	defer ep.Close()
-	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
-	}
+	ep.SocketOptions().SetV6Only(true)
 	if err := ep.Bind(addr); err != nil {
 		t.Fatalf("ep.Bind(%+v): %s", addr, err)
 	}
@@ -3250,9 +3253,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) {
 				t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
 			}
 			defer ep.Close()
-			if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-				t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
-			}
+			ep.SocketOptions().SetV6Only(true)
 
 			if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
 				t.Errorf("got ep.Connect(%+v) = %s, want = %s", dstAddr, err, tcpip.ErrNoRoute)
@@ -4044,9 +4045,9 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 						ndpConfigs.AutoGenAddressConflictRetries = maxRetries
 						s := stack.New(stack.Options{
 							NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-								AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
-								NDPConfigs:           ndpConfigs,
-								NDPDisp:              &ndpDisp,
+								AutoGenLinkLocal: addrType.autoGenLinkLocal,
+								NDPConfigs:       ndpConfigs,
+								NDPDisp:          &ndpDisp,
 								OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 									NICNameFromID: func(_ tcpip.NICID, nicName string) string {
 										return nicName
@@ -4179,9 +4180,9 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-					AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
-					NDPConfigs:           addrType.ndpConfigs,
-					NDPDisp:              &ndpDisp,
+					AutoGenLinkLocal: addrType.autoGenLinkLocal,
+					NDPConfigs:       addrType.ndpConfigs,
+					NDPDisp:          &ndpDisp,
 				})},
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
@@ -4708,7 +4709,7 @@ func TestCleanupNDPState(t *testing.T) {
 			}
 			s := stack.New(stack.Options{
 				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-					AutoGenIPv6LinkLocal: true,
+					AutoGenLinkLocal: true,
 					NDPConfigs: ipv6.NDPConfigurations{
 						HandleRAs:              true,
 						DiscoverDefaultRouters: true,
@@ -5174,113 +5175,99 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 	}
 
-	// This Run will not return until the parallel tests finish.
-	//
-	// We need this because we need to do some teardown work after the
-	// parallel tests complete.
-	//
-	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
-	// more details.
-	t.Run("group", func(t *testing.T) {
-		for _, test := range tests {
-			test := test
-
-			t.Run(test.name, func(t *testing.T) {
-				t.Parallel()
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			clock := faketime.NewManualClock()
+			e := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
+				headerLength: test.linkHeaderLen,
+			}
+			e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			waitForPkt := func(timeout time.Duration) {
+				t.Helper()
 
-				e := channelLinkWithHeaderLength{
-					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
-					headerLength: test.linkHeaderLen,
+				clock.Advance(timeout)
+				p, ok := e.Read()
+				if !ok {
+					t.Fatal("expected router solicitation packet")
 				}
-				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
-				waitForPkt := func(timeout time.Duration) {
-					t.Helper()
-					ctx, cancel := context.WithTimeout(context.Background(), timeout)
-					defer cancel()
-					p, ok := e.ReadContext(ctx)
-					if !ok {
-						t.Fatal("timed out waiting for packet")
-						return
-					}
 
-					if p.Proto != header.IPv6ProtocolNumber {
-						t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
-					}
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
 
-					// Make sure the right remote link address is used.
-					if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want {
-						t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
-					}
+				// Make sure the right remote link address is used.
+				if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); got != want {
+					t.Errorf("got remote link address = %s, want = %s", got, want)
+				}
 
-					checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
-						checker.SrcAddr(test.expectedSrcAddr),
-						checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
-						checker.TTL(header.NDPHopLimit),
-						checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
-					)
+				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+					checker.SrcAddr(test.expectedSrcAddr),
+					checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
+				)
 
-					if l, want := p.Pkt.AvailableHeaderBytes(), int(test.linkHeaderLen); l != want {
-						t.Errorf("got p.Pkt.AvailableHeaderBytes() = %d; want = %d", l, want)
-					}
-				}
-				waitForNothing := func(timeout time.Duration) {
-					t.Helper()
-					ctx, cancel := context.WithTimeout(context.Background(), timeout)
-					defer cancel()
-					if _, ok := e.ReadContext(ctx); ok {
-						t.Fatal("unexpectedly got a packet")
-					}
-				}
-				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-						NDPConfigs: ipv6.NDPConfigurations{
-							MaxRtrSolicitations:     test.maxRtrSolicit,
-							RtrSolicitationInterval: test.rtrSolicitInt,
-							MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
-						},
-					})},
-				})
-				if err := s.CreateNIC(nicID, &e); err != nil {
-					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				if l, want := p.Pkt.AvailableHeaderBytes(), int(test.linkHeaderLen); l != want {
+					t.Errorf("got p.Pkt.AvailableHeaderBytes() = %d; want = %d", l, want)
 				}
+			}
+			waitForNothing := func(timeout time.Duration) {
+				t.Helper()
 
-				if addr := test.nicAddr; addr != "" {
-					if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil {
-						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err)
-					}
+				clock.Advance(timeout)
+				if p, ok := e.Read(); ok {
+					t.Fatalf("unexpectedly got a packet = %#v", p)
 				}
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						MaxRtrSolicitations:     test.maxRtrSolicit,
+						RtrSolicitationInterval: test.rtrSolicitInt,
+						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
+					},
+				})},
+				Clock: clock,
+			})
+			if err := s.CreateNIC(nicID, &e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
 
-				// Make sure each RS is sent at the right time.
-				remaining := test.maxRtrSolicit
-				if remaining > 0 {
-					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncPositiveEventTimeout)
-					remaining--
+			if addr := test.nicAddr; addr != "" {
+				if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil {
+					t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err)
 				}
+			}
 
-				for ; remaining > 0; remaining-- {
-					if test.effectiveRtrSolicitInt > defaultAsyncPositiveEventTimeout {
-						waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncNegativeEventTimeout)
-						waitForPkt(defaultAsyncPositiveEventTimeout)
-					} else {
-						waitForPkt(test.effectiveRtrSolicitInt + defaultAsyncPositiveEventTimeout)
-					}
-				}
+			// Make sure each RS is sent at the right time.
+			remaining := test.maxRtrSolicit
+			if remaining > 0 {
+				waitForPkt(test.effectiveMaxRtrSolicitDelay)
+				remaining--
+			}
 
-				// Make sure no more RS.
-				if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
-					waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncNegativeEventTimeout)
+			for ; remaining > 0; remaining-- {
+				if test.effectiveRtrSolicitInt > defaultAsyncPositiveEventTimeout {
+					waitForNothing(test.effectiveRtrSolicitInt - time.Nanosecond)
+					waitForPkt(time.Nanosecond)
 				} else {
-					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncNegativeEventTimeout)
+					waitForPkt(test.effectiveRtrSolicitInt)
 				}
+			}
 
-				// Make sure the counter got properly
-				// incremented.
-				if got, want := s.Stats().ICMP.V6PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want {
-					t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want)
-				}
-			})
-		}
-	})
+			// Make sure no more RS.
+			if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
+				waitForNothing(test.effectiveRtrSolicitInt)
+			} else {
+				waitForNothing(test.effectiveMaxRtrSolicitDelay)
+			}
+
+			if got, want := s.Stats().ICMP.V6.PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want {
+				t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want)
+			}
+		})
+	}
 }
 
 func TestStopStartSolicitingRouters(t *testing.T) {
diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go
index 177bf5516..317f6871d 100644
--- a/pkg/tcpip/stack/neighbor_cache.go
+++ b/pkg/tcpip/stack/neighbor_cache.go
@@ -24,9 +24,16 @@ import (
 
 const neighborCacheSize = 512 // max entries per interface
 
+// NeighborStats holds metrics for the neighbor table.
+type NeighborStats struct {
+	// FailedEntryLookups counts the number of lookups performed on an entry in
+	// Failed state.
+	FailedEntryLookups *tcpip.StatCounter
+}
+
 // neighborCache maps IP addresses to link addresses. It uses the Least
 // Recently Used (LRU) eviction strategy to implement a bounded cache for
-// dynmically acquired entries. It contains the state machine and configuration
+// dynamically acquired entries. It contains the state machine and configuration
 // for running Neighbor Unreachability Detection (NUD).
 //
 // There are two types of entries in the neighbor cache:
@@ -175,14 +182,15 @@ func (n *neighborCache) removeWaker(addr tcpip.Address, waker *sleep.Waker) {
 
 // entries returns all entries in the neighbor cache.
 func (n *neighborCache) entries() []NeighborEntry {
-	entries := make([]NeighborEntry, 0, len(n.cache))
 	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	entries := make([]NeighborEntry, 0, len(n.cache))
 	for _, entry := range n.cache {
 		entry.mu.RLock()
 		entries = append(entries, entry.neigh)
 		entry.mu.RUnlock()
 	}
-	n.mu.RUnlock()
 	return entries
 }
 
@@ -226,6 +234,8 @@ func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAd
 }
 
 // removeEntryLocked removes the specified entry from the neighbor cache.
+//
+// Prerequisite: n.mu and entry.mu MUST be locked.
 func (n *neighborCache) removeEntryLocked(entry *neighborEntry) {
 	if entry.neigh.State != Static {
 		n.dynamic.lru.Remove(entry)
diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go
index ed33418f3..732a299f7 100644
--- a/pkg/tcpip/stack/neighbor_cache_test.go
+++ b/pkg/tcpip/stack/neighbor_cache_test.go
@@ -80,17 +80,20 @@ func entryDiffOptsWithSort() []cmp.Option {
 func newTestNeighborCache(nudDisp NUDDispatcher, config NUDConfigurations, clock tcpip.Clock) *neighborCache {
 	config.resetInvalidFields()
 	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
-	return &neighborCache{
+	neigh := &neighborCache{
 		nic: &NIC{
 			stack: &Stack{
 				clock:   clock,
 				nudDisp: nudDisp,
 			},
-			id: 1,
+			id:    1,
+			stats: makeNICStats(),
 		},
 		state: NewNUDState(config, rng),
 		cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize),
 	}
+	neigh.nic.neigh = neigh
+	return neigh
 }
 
 // testEntryStore contains a set of IP to NeighborEntry mappings.
diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go
index 493e48031..32399b4f5 100644
--- a/pkg/tcpip/stack/neighbor_entry.go
+++ b/pkg/tcpip/stack/neighbor_entry.go
@@ -258,7 +258,7 @@ func (e *neighborEntry) setStateLocked(next NeighborState) {
 
 	case Failed:
 		e.notifyWakersLocked()
-		e.job = e.nic.stack.newJob(&e.mu, func() {
+		e.job = e.nic.stack.newJob(&doubleLock{first: &e.nic.neigh.mu, second: &e.mu}, func() {
 			e.nic.neigh.removeEntryLocked(e)
 		})
 		e.job.Schedule(config.UnreachableTime)
@@ -347,9 +347,10 @@ func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) {
 		e.setStateLocked(Delay)
 		e.dispatchChangeEventLocked()
 
-	case Incomplete, Reachable, Delay, Probe, Static, Failed:
+	case Incomplete, Reachable, Delay, Probe, Static:
 		// Do nothing
-
+	case Failed:
+		e.nic.stats.Neighbor.FailedEntryLookups.Increment()
 	default:
 		panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State))
 	}
@@ -511,3 +512,23 @@ func (e *neighborEntry) handleUpperLevelConfirmationLocked() {
 		panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State))
 	}
 }
+
+// doubleLock combines two locks into one while maintaining lock ordering.
+//
+// TODO(gvisor.dev/issue/4796): Remove this once subsequent traffic to a Failed
+// neighbor is allowed.
+type doubleLock struct {
+	first, second sync.Locker
+}
+
+// Lock locks both locks in order: first then second.
+func (l *doubleLock) Lock() {
+	l.first.Lock()
+	l.second.Lock()
+}
+
+// Unlock unlocks both locks in reverse order: second then first.
+func (l *doubleLock) Unlock() {
+	l.second.Unlock()
+	l.first.Unlock()
+}
diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go
index c2b763325..c497d3932 100644
--- a/pkg/tcpip/stack/neighbor_entry_test.go
+++ b/pkg/tcpip/stack/neighbor_entry_test.go
@@ -89,7 +89,7 @@ func eventDiffOptsWithSort() []cmp.Option {
 // | Stale      | Reachable  | Solicited confirmation w/o address         | Notify wakers   | Changed |
 // | Stale      | Stale      | Override confirmation                      | Update LinkAddr | Changed |
 // | Stale      | Stale      | Probe w/ different address                 | Update LinkAddr | Changed |
-// | Stale      | Delay      | Packet sent                                |                 | Changed |
+// | Stale      | Delay      | Packet queued                              |                 | Changed |
 // | Delay      | Reachable  | Upper-layer confirmation                   |                 | Changed |
 // | Delay      | Reachable  | Solicited override confirmation            | Update LinkAddr | Changed |
 // | Delay      | Reachable  | Solicited confirmation w/o address         | Notify wakers   | Changed |
@@ -101,6 +101,7 @@ func eventDiffOptsWithSort() []cmp.Option {
 // | Probe      | Stale      | Probe or confirmation w/ different address |                 | Changed |
 // | Probe      | Probe      | Retransmit timer expired                   | Send probe      | Changed |
 // | Probe      | Failed     | Max probes sent without reply              | Notify wakers   | Removed |
+// | Failed     | Failed     | Packet queued                              |                 |         |
 // | Failed     |            | Unreachability timer expired               | Delete entry    |         |
 
 type testEntryEventType uint8
@@ -228,6 +229,7 @@ func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *e
 			clock:   clock,
 			nudDisp: &disp,
 		},
+		stats: makeNICStats(),
 	}
 	nic.networkEndpoints = map[tcpip.NetworkProtocolNumber]NetworkEndpoint{
 		header.IPv6ProtocolNumber: (&testIPv6Protocol{}).NewEndpoint(&nic, nil, nil, nil),
@@ -3433,6 +3435,146 @@ func TestEntryProbeToFailed(t *testing.T) {
 	nudDisp.mu.Unlock()
 }
 
+func TestEntryFailedToFailed(t *testing.T) {
+	c := DefaultNUDConfigurations()
+	c.MaxMulticastProbes = 3
+	c.MaxUnicastProbes = 3
+	e, nudDisp, linkRes, clock := entryTestSetup(c)
+
+	// Verify the cache contains the entry.
+	if _, ok := e.nic.neigh.cache[entryTestAddr1]; !ok {
+		t.Errorf("expected entry %q to exist in the neighbor cache", entryTestAddr1)
+	}
+
+	// TODO(gvisor.dev/issue/4872): Use helper functions to start entry tests in
+	// their expected state.
+	e.mu.Lock()
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	e.mu.Unlock()
+
+	runImmediatelyScheduledJobs(clock)
+	{
+		wantProbes := []entryTestProbeInfo{
+			{
+				RemoteAddress: entryTestAddr1,
+				LocalAddress:  entryTestAddr2,
+			},
+		}
+		linkRes.mu.Lock()
+		diff := cmp.Diff(linkRes.probes, wantProbes)
+		linkRes.probes = nil
+		linkRes.mu.Unlock()
+		if diff != "" {
+			t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+		}
+	}
+
+	e.mu.Lock()
+	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+		Solicited: false,
+		Override:  false,
+		IsRouter:  false,
+	})
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	e.mu.Unlock()
+
+	waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes)
+	clock.Advance(waitFor)
+	{
+		wantProbes := []entryTestProbeInfo{
+			{
+				RemoteAddress:     entryTestAddr1,
+				RemoteLinkAddress: entryTestLinkAddr1,
+			},
+			{
+				RemoteAddress:     entryTestAddr1,
+				RemoteLinkAddress: entryTestLinkAddr1,
+			},
+			{
+				RemoteAddress:     entryTestAddr1,
+				RemoteLinkAddress: entryTestLinkAddr1,
+			},
+		}
+		linkRes.mu.Lock()
+		diff := cmp.Diff(linkRes.probes, wantProbes)
+		linkRes.mu.Unlock()
+		if diff != "" {
+			t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+		}
+	}
+
+	wantEvents := []testEntryEventInfo{
+		{
+			EventType: entryTestAdded,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
+		},
+		{
+			EventType: entryTestRemoved,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
+		},
+	}
+	nudDisp.mu.Lock()
+	if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+		t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+	}
+	nudDisp.mu.Unlock()
+
+	failedLookups := e.nic.stats.Neighbor.FailedEntryLookups
+	if got := failedLookups.Value(); got != 0 {
+		t.Errorf("got Neighbor.FailedEntryLookups = %d, want = 0", got)
+	}
+
+	e.mu.Lock()
+	// Verify queuing a packet to the entry immediately fails.
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	state := e.neigh.State
+	e.mu.Unlock()
+	if state != Failed {
+		t.Errorf("got e.neigh.State = %q, want = %q", state, Failed)
+	}
+
+	if got := failedLookups.Value(); got != 1 {
+		t.Errorf("got Neighbor.FailedEntryLookups = %d, want = 1", got)
+	}
+}
+
 func TestEntryFailedGetsDeleted(t *testing.T) {
 	c := DefaultNUDConfigurations()
 	c.MaxMulticastProbes = 3
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3e6ceff28..5887aa1ed 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -60,12 +60,14 @@ type NIC struct {
 	}
 }
 
-// NICStats includes transmitted and received stats.
+// NICStats hold statistics for a NIC.
 type NICStats struct {
 	Tx DirectionStats
 	Rx DirectionStats
 
 	DisabledRx DirectionStats
+
+	Neighbor NeighborStats
 }
 
 func makeNICStats() NICStats {
@@ -265,7 +267,7 @@ func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumb
 	if ch, err := r.Resolve(nil); err != nil {
 		if err == tcpip.ErrWouldBlock {
 			r := r.Clone()
-			n.stack.linkResQueue.enqueue(ch, &r, protocol, pkt)
+			n.stack.linkResQueue.enqueue(ch, r, protocol, pkt)
 			return nil
 		}
 		return err
@@ -277,9 +279,9 @@ func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumb
 // WritePacketToRemote implements NetworkInterface.
 func (n *NIC) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
 	r := Route{
-		NetProto:          protocol,
-		RemoteLinkAddress: remoteLinkAddr,
+		NetProto: protocol,
 	}
+	r.ResolveWith(remoteLinkAddr)
 	return n.writePacket(&r, gso, protocol, pkt)
 }
 
@@ -561,8 +563,7 @@ func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address
 		return tcpip.ErrNotSupported
 	}
 
-	_, err := gep.JoinGroup(addr)
-	return err
+	return gep.JoinGroup(addr)
 }
 
 // leaveGroup decrements the count for the given multicast address, and when it
@@ -578,11 +579,7 @@ func (n *NIC) leaveGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Addres
 		return tcpip.ErrNotSupported
 	}
 
-	if _, err := gep.LeaveGroup(addr); err != nil {
-		return err
-	}
-
-	return nil
+	return gep.LeaveGroup(addr)
 }
 
 // isInGroup returns true if n has joined the multicast group addr.
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 2cb13c6fa..b334e27c4 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -259,15 +259,6 @@ const (
 	PacketLoop
 )
 
-// NetOptions is an interface that allows us to pass network protocol specific
-// options through the Stack layer code.
-type NetOptions interface {
-	// SizeWithPadding returns the amount of memory that must be allocated to
-	// hold the options given that the value must be rounded up to the next
-	// multiple of 4 bytes.
-	SizeWithPadding() int
-}
-
 // NetworkHeaderParams are the header parameters given as input by the
 // transport endpoint to the network.
 type NetworkHeaderParams struct {
@@ -279,10 +270,6 @@ type NetworkHeaderParams struct {
 
 	// TOS refers to TypeOfService or TrafficClass field of the IP-header.
 	TOS uint8
-
-	// Options is a set of options to add to a network header (or nil).
-	// It will be protocol specific opaque information from higher layers.
-	Options NetOptions
 }
 
 // GroupAddressableEndpoint is an endpoint that supports group addressing.
@@ -291,14 +278,10 @@ type NetworkHeaderParams struct {
 // endpoints may associate themselves with the same identifier (group address).
 type GroupAddressableEndpoint interface {
 	// JoinGroup joins the specified group.
-	//
-	// Returns true if the group was newly joined.
-	JoinGroup(group tcpip.Address) (bool, *tcpip.Error)
+	JoinGroup(group tcpip.Address) *tcpip.Error
 
 	// LeaveGroup attempts to leave the specified group.
-	//
-	// Returns tcpip.ErrBadLocalAddress if the endpoint has not joined the group.
-	LeaveGroup(group tcpip.Address) (bool, *tcpip.Error)
+	LeaveGroup(group tcpip.Address) *tcpip.Error
 
 	// IsInGroup returns true if the endpoint is a member of the specified group.
 	IsInGroup(group tcpip.Address) bool
@@ -739,10 +722,6 @@ type LinkEndpoint interface {
 	// endpoint.
 	Capabilities() LinkEndpointCapabilities
 
-	// WriteRawPacket writes a packet directly to the link. The packet
-	// should already have an ethernet header. It takes ownership of vv.
-	WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error
-
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
 	//
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 53cb6694f..de5fe6ffe 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -18,19 +18,22 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 // Route represents a route through the networking stack to a given destination.
+//
+// It is safe to call Route's methods from multiple goroutines.
+//
+// The exported fields are immutable.
+//
+// TODO(gvisor.dev/issue/4902): Unexpose immutable fields.
 type Route struct {
 	// RemoteAddress is the final destination of the route.
 	RemoteAddress tcpip.Address
 
-	// RemoteLinkAddress is the link-layer (MAC) address of the
-	// final destination of the route.
-	RemoteLinkAddress tcpip.LinkAddress
-
 	// LocalAddress is the local address where the route starts.
 	LocalAddress tcpip.Address
 
@@ -52,8 +55,16 @@ type Route struct {
 	// address's assigned status without the NIC.
 	localAddressNIC *NIC
 
-	// localAddressEndpoint is the local address this route is associated with.
-	localAddressEndpoint AssignableAddressEndpoint
+	mu struct {
+		sync.RWMutex
+
+		// localAddressEndpoint is the local address this route is associated with.
+		localAddressEndpoint AssignableAddressEndpoint
+
+		// remoteLinkAddress is the link-layer (MAC) address of the next hop in the
+		// route.
+		remoteLinkAddress tcpip.LinkAddress
+	}
 
 	// outgoingNIC is the interface this route uses to write packets.
 	outgoingNIC *NIC
@@ -71,22 +82,24 @@ type Route struct {
 // ownership of the provided local address.
 //
 // Returns an empty route if validation fails.
-func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndpoint AssignableAddressEndpoint, localAddressNIC, outgoingNIC *NIC, gateway, remoteAddr tcpip.Address, handleLocal, multicastLoop bool) Route {
-	addrWithPrefix := addressEndpoint.AddressWithPrefix()
+func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndpoint AssignableAddressEndpoint, localAddressNIC, outgoingNIC *NIC, gateway, localAddr, remoteAddr tcpip.Address, handleLocal, multicastLoop bool) *Route {
+	if len(localAddr) == 0 {
+		localAddr = addressEndpoint.AddressWithPrefix().Address
+	}
 
-	if localAddressNIC != outgoingNIC && header.IsV6LinkLocalAddress(addrWithPrefix.Address) {
+	if localAddressNIC != outgoingNIC && header.IsV6LinkLocalAddress(localAddr) {
 		addressEndpoint.DecRef()
-		return Route{}
+		return nil
 	}
 
 	// If no remote address is provided, use the local address.
 	if len(remoteAddr) == 0 {
-		remoteAddr = addrWithPrefix.Address
+		remoteAddr = localAddr
 	}
 
 	r := makeRoute(
 		netProto,
-		addrWithPrefix.Address,
+		localAddr,
 		remoteAddr,
 		outgoingNIC,
 		localAddressNIC,
@@ -99,8 +112,8 @@ func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndp
 	// broadcast it.
 	if len(gateway) > 0 {
 		r.NextHop = gateway
-	} else if subnet := addrWithPrefix.Subnet(); subnet.IsBroadcast(remoteAddr) {
-		r.RemoteLinkAddress = header.EthernetBroadcastAddress
+	} else if subnet := addressEndpoint.Subnet(); subnet.IsBroadcast(remoteAddr) {
+		r.ResolveWith(header.EthernetBroadcastAddress)
 	}
 
 	return r
@@ -108,11 +121,15 @@ func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndp
 
 // makeRoute initializes a new route. It takes ownership of the provided
 // AssignableAddressEndpoint.
-func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) Route {
+func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) *Route {
 	if localAddressNIC.stack != outgoingNIC.stack {
 		panic(fmt.Sprintf("cannot create a route with NICs from different stacks"))
 	}
 
+	if len(localAddr) == 0 {
+		localAddr = localAddressEndpoint.AddressWithPrefix().Address
+	}
+
 	loop := PacketOut
 
 	// TODO(gvisor.dev/issue/4689): Loopback interface loops back packets at the
@@ -133,18 +150,21 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip
 	return makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop)
 }
 
-func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, loop PacketLooping) Route {
-	r := Route{
-		NetProto:             netProto,
-		LocalAddress:         localAddr,
-		LocalLinkAddress:     outgoingNIC.LinkEndpoint.LinkAddress(),
-		RemoteAddress:        remoteAddr,
-		localAddressNIC:      localAddressNIC,
-		localAddressEndpoint: localAddressEndpoint,
-		outgoingNIC:          outgoingNIC,
-		Loop:                 loop,
+func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, loop PacketLooping) *Route {
+	r := &Route{
+		NetProto:         netProto,
+		LocalAddress:     localAddr,
+		LocalLinkAddress: outgoingNIC.LinkEndpoint.LinkAddress(),
+		RemoteAddress:    remoteAddr,
+		localAddressNIC:  localAddressNIC,
+		outgoingNIC:      outgoingNIC,
+		Loop:             loop,
 	}
 
+	r.mu.Lock()
+	r.mu.localAddressEndpoint = localAddressEndpoint
+	r.mu.Unlock()
+
 	if r.outgoingNIC.LinkEndpoint.Capabilities()&CapabilityResolutionRequired != 0 {
 		if linkRes, ok := r.outgoingNIC.stack.linkAddrResolvers[r.NetProto]; ok {
 			r.linkRes = linkRes
@@ -159,7 +179,7 @@ func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr
 // provided AssignableAddressEndpoint.
 //
 // A local route is a route to a destination that is local to the stack.
-func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint) Route {
+func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint) *Route {
 	loop := PacketLoop
 	// TODO(gvisor.dev/issue/4689): Loopback interface loops back packets at the
 	// link endpoint level. We can remove this check once loopback interfaces
@@ -170,6 +190,14 @@ func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr
 	return makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop)
 }
 
+// RemoteLinkAddress returns the link-layer (MAC) address of the next hop in
+// the route.
+func (r *Route) RemoteLinkAddress() tcpip.LinkAddress {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	return r.mu.remoteLinkAddress
+}
+
 // NICID returns the id of the NIC from which this route originates.
 func (r *Route) NICID() tcpip.NICID {
 	return r.outgoingNIC.ID()
@@ -231,7 +259,9 @@ func (r *Route) GSOMaxSize() uint32 {
 // ResolveWith immediately resolves a route with the specified remote link
 // address.
 func (r *Route) ResolveWith(addr tcpip.LinkAddress) {
-	r.RemoteLinkAddress = addr
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.mu.remoteLinkAddress = addr
 }
 
 // Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in
@@ -244,7 +274,10 @@ func (r *Route) ResolveWith(addr tcpip.LinkAddress) {
 //
 // The NIC r uses must not be locked.
 func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
-	if !r.IsResolutionRequired() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if !r.isResolutionRequiredRLocked() {
 		// Nothing to do if there is no cache (which does the resolution on cache miss) or
 		// link address is already known.
 		return nil, nil
@@ -254,7 +287,7 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 	if nextAddr == "" {
 		// Local link address is already known.
 		if r.RemoteAddress == r.LocalAddress {
-			r.RemoteLinkAddress = r.LocalLinkAddress
+			r.mu.remoteLinkAddress = r.LocalLinkAddress
 			return nil, nil
 		}
 		nextAddr = r.RemoteAddress
@@ -272,7 +305,7 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 		if err != nil {
 			return ch, err
 		}
-		r.RemoteLinkAddress = entry.LinkAddr
+		r.mu.remoteLinkAddress = entry.LinkAddr
 		return nil, nil
 	}
 
@@ -280,7 +313,7 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 	if err != nil {
 		return ch, err
 	}
-	r.RemoteLinkAddress = linkAddr
+	r.mu.remoteLinkAddress = linkAddr
 	return nil, nil
 }
 
@@ -309,7 +342,13 @@ func (r *Route) local() bool {
 //
 // The NICs the route is associated with must not be locked.
 func (r *Route) IsResolutionRequired() bool {
-	if !r.isValidForOutgoing() || r.RemoteLinkAddress != "" || r.local() {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	return r.isResolutionRequiredRLocked()
+}
+
+func (r *Route) isResolutionRequiredRLocked() bool {
+	if !r.isValidForOutgoingRLocked() || r.mu.remoteLinkAddress != "" || r.local() {
 		return false
 	}
 
@@ -317,11 +356,18 @@ func (r *Route) IsResolutionRequired() bool {
 }
 
 func (r *Route) isValidForOutgoing() bool {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	return r.isValidForOutgoingRLocked()
+}
+
+func (r *Route) isValidForOutgoingRLocked() bool {
 	if !r.outgoingNIC.Enabled() {
 		return false
 	}
 
-	if !r.localAddressNIC.isValidForOutgoing(r.localAddressEndpoint) {
+	localAddressEndpoint := r.mu.localAddressEndpoint
+	if localAddressEndpoint == nil || !r.localAddressNIC.isValidForOutgoing(localAddressEndpoint) {
 		return false
 	}
 
@@ -375,37 +421,44 @@ func (r *Route) MTU() uint32 {
 
 // Release frees all resources associated with the route.
 func (r *Route) Release() {
-	if r.localAddressEndpoint != nil {
-		r.localAddressEndpoint.DecRef()
-		r.localAddressEndpoint = nil
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.mu.localAddressEndpoint != nil {
+		r.mu.localAddressEndpoint.DecRef()
+		r.mu.localAddressEndpoint = nil
 	}
 }
 
 // Clone clones the route.
-func (r *Route) Clone() Route {
-	if r.localAddressEndpoint != nil {
-		if !r.localAddressEndpoint.IncRef() {
-			panic(fmt.Sprintf("failed to increment reference count for local address endpoint = %s", r.LocalAddress))
-		}
+func (r *Route) Clone() *Route {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+
+	newRoute := &Route{
+		RemoteAddress:    r.RemoteAddress,
+		LocalAddress:     r.LocalAddress,
+		LocalLinkAddress: r.LocalLinkAddress,
+		NextHop:          r.NextHop,
+		NetProto:         r.NetProto,
+		Loop:             r.Loop,
+		localAddressNIC:  r.localAddressNIC,
+		outgoingNIC:      r.outgoingNIC,
+		linkCache:        r.linkCache,
+		linkRes:          r.linkRes,
 	}
-	return *r
-}
 
-// MakeLoopedRoute duplicates the given route with special handling for routes
-// used for sending multicast or broadcast packets. In those cases the
-// multicast/broadcast address is the remote address when sending out, but for
-// incoming (looped) packets it becomes the local address. Similarly, the local
-// interface address that was the local address going out becomes the remote
-// address coming in. This is different to unicast routes where local and
-// remote addresses remain the same as they identify location (local vs remote)
-// not direction (source vs destination).
-func (r *Route) MakeLoopedRoute() Route {
-	l := r.Clone()
-	if r.RemoteAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress) {
-		l.RemoteAddress, l.LocalAddress = l.LocalAddress, l.RemoteAddress
-		l.RemoteLinkAddress = l.LocalLinkAddress
+	newRoute.mu.Lock()
+	defer newRoute.mu.Unlock()
+	newRoute.mu.localAddressEndpoint = r.mu.localAddressEndpoint
+	if newRoute.mu.localAddressEndpoint != nil {
+		if !newRoute.mu.localAddressEndpoint.IncRef() {
+			panic(fmt.Sprintf("failed to increment reference count for local address endpoint = %s", newRoute.LocalAddress))
+		}
 	}
-	return l
+	newRoute.mu.remoteLinkAddress = r.mu.remoteLinkAddress
+
+	return newRoute
 }
 
 // Stack returns the instance of the Stack that owns this route.
@@ -418,7 +471,14 @@ func (r *Route) isV4Broadcast(addr tcpip.Address) bool {
 		return true
 	}
 
-	subnet := r.localAddressEndpoint.Subnet()
+	r.mu.RLock()
+	localAddressEndpoint := r.mu.localAddressEndpoint
+	r.mu.RUnlock()
+	if localAddressEndpoint == nil {
+		return false
+	}
+
+	subnet := localAddressEndpoint.Subnet()
 	return subnet.IsBroadcast(addr)
 }
 
@@ -428,27 +488,3 @@ func (r *Route) IsOutboundBroadcast() bool {
 	// Only IPv4 has a notion of broadcast.
 	return r.isV4Broadcast(r.RemoteAddress)
 }
-
-// isInboundBroadcast returns true if the route is for an inbound broadcast
-// packet.
-func (r *Route) isInboundBroadcast() bool {
-	// Only IPv4 has a notion of broadcast.
-	return r.isV4Broadcast(r.LocalAddress)
-}
-
-// ReverseRoute returns new route with given source and destination address.
-func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route {
-	return Route{
-		NetProto:             r.NetProto,
-		LocalAddress:         dst,
-		LocalLinkAddress:     r.RemoteLinkAddress,
-		RemoteAddress:        src,
-		RemoteLinkAddress:    r.LocalLinkAddress,
-		Loop:                 r.Loop,
-		localAddressNIC:      r.localAddressNIC,
-		localAddressEndpoint: r.localAddressEndpoint,
-		outgoingNIC:          r.outgoingNIC,
-		linkCache:            r.linkCache,
-		linkRes:              r.linkRes,
-	}
-}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index e0025e0a9..dc4f5b3e7 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -1118,6 +1118,16 @@ func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber,
 	return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint)
 }
 
+// AddAddressWithPrefix is the same as AddAddress, but allows you to specify
+// the address prefix.
+func (s *Stack) AddAddressWithPrefix(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.AddressWithPrefix) *tcpip.Error {
+	ap := tcpip.ProtocolAddress{
+		Protocol:          protocol,
+		AddressWithPrefix: addr,
+	}
+	return s.AddProtocolAddressWithOptions(id, ap, CanBePrimaryEndpoint)
+}
+
 // AddProtocolAddress adds a new network-layer protocol address to the
 // specified NIC.
 func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) *tcpip.Error {
@@ -1208,10 +1218,10 @@ func (s *Stack) getAddressEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netP
 // from the specified NIC.
 //
 // Precondition: s.mu must be read locked.
-func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (route Route, ok bool) {
+func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route {
 	localAddressEndpoint := localAddressNIC.getAddressOrCreateTempInner(netProto, localAddr, false /* createTemp */, NeverPrimaryEndpoint)
 	if localAddressEndpoint == nil {
-		return Route{}, false
+		return nil
 	}
 
 	var outgoingNIC *NIC
@@ -1235,12 +1245,12 @@ func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, re
 	// route.
 	if outgoingNIC == nil {
 		localAddressEndpoint.DecRef()
-		return Route{}, false
+		return nil
 	}
 
 	r := makeLocalRoute(
 		netProto,
-		localAddressEndpoint.AddressWithPrefix().Address,
+		localAddr,
 		remoteAddr,
 		outgoingNIC,
 		localAddressNIC,
@@ -1249,10 +1259,10 @@ func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, re
 
 	if r.IsOutboundBroadcast() {
 		r.Release()
-		return Route{}, false
+		return nil
 	}
 
-	return r, true
+	return r
 }
 
 // findLocalRouteRLocked returns a local route.
@@ -1261,26 +1271,26 @@ func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, re
 // is, a local route is a route where packets never have to leave the stack.
 //
 // Precondition: s.mu must be read locked.
-func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (route Route, ok bool) {
+func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route {
 	if len(localAddr) == 0 {
 		localAddr = remoteAddr
 	}
 
 	if localAddressNICID == 0 {
 		for _, localAddressNIC := range s.nics {
-			if r, ok := s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto); ok {
-				return r, true
+			if r := s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto); r != nil {
+				return r
 			}
 		}
 
-		return Route{}, false
+		return nil
 	}
 
 	if localAddressNIC, ok := s.nics[localAddressNICID]; ok {
 		return s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto)
 	}
 
-	return Route{}, false
+	return nil
 }
 
 // FindRoute creates a route to the given destination address, leaving through
@@ -1294,7 +1304,7 @@ func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr,
 // If no local address is provided, the stack will select a local address. If no
 // remote address is provided, the stack wil use a remote address equal to the
 // local address.
-func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) {
+func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (*Route, *tcpip.Error) {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
@@ -1305,7 +1315,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	needRoute := !(isLocalBroadcast || isMulticast || isLinkLocal || isLoopback)
 
 	if s.handleLocal && !isMulticast && !isLocalBroadcast {
-		if r, ok := s.findLocalRouteRLocked(id, localAddr, remoteAddr, netProto); ok {
+		if r := s.findLocalRouteRLocked(id, localAddr, remoteAddr, netProto); r != nil {
 			return r, nil
 		}
 	}
@@ -1317,7 +1327,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 			if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
 				return makeRoute(
 					netProto,
-					addressEndpoint.AddressWithPrefix().Address,
+					localAddr,
 					remoteAddr,
 					nic, /* outboundNIC */
 					nic, /* localAddressNIC*/
@@ -1329,9 +1339,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 		}
 
 		if isLoopback {
-			return Route{}, tcpip.ErrBadLocalAddress
+			return nil, tcpip.ErrBadLocalAddress
 		}
-		return Route{}, tcpip.ErrNetworkUnreachable
+		return nil, tcpip.ErrNetworkUnreachable
 	}
 
 	canForward := s.Forwarding(netProto) && !header.IsV6LinkLocalAddress(localAddr) && !isLinkLocal
@@ -1354,8 +1364,8 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 				if needRoute {
 					gateway = route.Gateway
 				}
-				r := constructAndValidateRoute(netProto, addressEndpoint, nic /* outgoingNIC */, nic /* outgoingNIC */, gateway, remoteAddr, s.handleLocal, multicastLoop)
-				if r == (Route{}) {
+				r := constructAndValidateRoute(netProto, addressEndpoint, nic /* outgoingNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop)
+				if r == nil {
 					panic(fmt.Sprintf("non-forwarding route validation failed with route table entry = %#v, id = %d, localAddr = %s, remoteAddr = %s", route, id, localAddr, remoteAddr))
 				}
 				return r, nil
@@ -1391,13 +1401,13 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 		if id != 0 {
 			if aNIC, ok := s.nics[id]; ok {
 				if addressEndpoint := s.getAddressEP(aNIC, localAddr, remoteAddr, netProto); addressEndpoint != nil {
-					if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, remoteAddr, s.handleLocal, multicastLoop); r != (Route{}) {
+					if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop); r != nil {
 						return r, nil
 					}
 				}
 			}
 
-			return Route{}, tcpip.ErrNoRoute
+			return nil, tcpip.ErrNoRoute
 		}
 
 		if id == 0 {
@@ -1409,7 +1419,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 					continue
 				}
 
-				if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, remoteAddr, s.handleLocal, multicastLoop); r != (Route{}) {
+				if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop); r != nil {
 					return r, nil
 				}
 			}
@@ -1417,12 +1427,12 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	}
 
 	if needRoute {
-		return Route{}, tcpip.ErrNoRoute
+		return nil, tcpip.ErrNoRoute
 	}
 	if header.IsV6LoopbackAddress(remoteAddr) {
-		return Route{}, tcpip.ErrBadLocalAddress
+		return nil, tcpip.ErrBadLocalAddress
 	}
-	return Route{}, tcpip.ErrNetworkUnreachable
+	return nil, tcpip.ErrNetworkUnreachable
 }
 
 // CheckNetworkProtocol checks if a given network protocol is enabled in the
@@ -1810,49 +1820,20 @@ func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip
 	nic.unregisterPacketEndpoint(netProto, ep)
 }
 
-// WritePacket writes data directly to the specified NIC. It adds an ethernet
-// header based on the arguments.
-func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
+// WritePacketToRemote writes a payload on the specified NIC using the provided
+// network protocol and remote link address.
+func (s *Stack) WritePacketToRemote(nicID tcpip.NICID, remote tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
 	s.mu.Lock()
 	nic, ok := s.nics[nicID]
 	s.mu.Unlock()
 	if !ok {
 		return tcpip.ErrUnknownDevice
 	}
-
-	// Add our own fake ethernet header.
-	ethFields := header.EthernetFields{
-		SrcAddr: nic.LinkEndpoint.LinkAddress(),
-		DstAddr: dst,
-		Type:    netProto,
-	}
-	fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
-	fakeHeader.Encode(&ethFields)
-	vv := buffer.View(fakeHeader).ToVectorisedView()
-	vv.Append(payload)
-
-	if err := nic.LinkEndpoint.WriteRawPacket(vv); err != nil {
-		return err
-	}
-
-	return nil
-}
-
-// WriteRawPacket writes data directly to the specified NIC without adding any
-// headers.
-func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
-	s.mu.Lock()
-	nic, ok := s.nics[nicID]
-	s.mu.Unlock()
-	if !ok {
-		return tcpip.ErrUnknownDevice
-	}
-
-	if err := nic.LinkEndpoint.WriteRawPacket(payload); err != nil {
-		return err
-	}
-
-	return nil
+	pkt := NewPacketBuffer(PacketBufferOptions{
+		ReserveHeaderBytes: int(nic.MaxHeaderLength()),
+		Data:               payload,
+	})
+	return nic.WritePacketToRemote(remote, nil, netProto, pkt)
 }
 
 // NetworkProtocolInstance returns the protocol instance in the stack for the
@@ -1912,7 +1893,6 @@ func (s *Stack) RemoveTCPProbe() {
 
 // JoinGroup joins the given multicast group on the given NIC.
 func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
-	// TODO: notify network of subscription via igmp protocol.
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
@@ -2159,3 +2139,43 @@ func (s *Stack) networkProtocolNumbers() []tcpip.NetworkProtocolNumber {
 	}
 	return protos
 }
+
+func isSubnetBroadcastOnNIC(nic *NIC, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool {
+	addressEndpoint := nic.getAddressOrCreateTempInner(protocol, addr, false /* createTemp */, NeverPrimaryEndpoint)
+	if addressEndpoint == nil {
+		return false
+	}
+
+	subnet := addressEndpoint.Subnet()
+	addressEndpoint.DecRef()
+	return subnet.IsBroadcast(addr)
+}
+
+// IsSubnetBroadcast returns true if the provided address is a subnet-local
+// broadcast address on the specified NIC and protocol.
+//
+// Returns false if the NIC is unknown or if the protocol is unknown or does
+// not support addressing.
+//
+// If the NIC is not specified, the stack will check all NICs.
+func (s *Stack) IsSubnetBroadcast(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nicID != 0 {
+		nic, ok := s.nics[nicID]
+		if !ok {
+			return false
+		}
+
+		return isSubnetBroadcastOnNIC(nic, protocol, addr)
+	}
+
+	for _, nic := range s.nics {
+		if isSubnetBroadcastOnNIC(nic, protocol, addr) {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 61db3164b..457990945 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -27,7 +27,6 @@ import (
 	"time"
 
 	"github.com/google/go-cmp/cmp"
-	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -407,7 +406,7 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro
 	return send(r, payload)
 }
 
-func send(r stack.Route, payload buffer.View) *tcpip.Error {
+func send(r *stack.Route, payload buffer.View) *tcpip.Error {
 	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.NewPacketBuffer(stack.PacketBufferOptions{
 		ReserveHeaderBytes: int(r.MaxHeaderLength()),
 		Data:               payload.ToVectorisedView(),
@@ -425,7 +424,7 @@ func testSendTo(t *testing.T, s *stack.Stack, addr tcpip.Address, ep *channel.En
 	}
 }
 
-func testSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.View) {
+func testSend(t *testing.T, r *stack.Route, ep *channel.Endpoint, payload buffer.View) {
 	t.Helper()
 	ep.Drain()
 	if err := send(r, payload); err != nil {
@@ -436,7 +435,7 @@ func testSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.
 	}
 }
 
-func testFailingSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.View, wantErr *tcpip.Error) {
+func testFailingSend(t *testing.T, r *stack.Route, ep *channel.Endpoint, payload buffer.View, wantErr *tcpip.Error) {
 	t.Helper()
 	if gotErr := send(r, payload); gotErr != wantErr {
 		t.Errorf("send failed: got = %s, want = %s ", gotErr, wantErr)
@@ -1563,15 +1562,15 @@ func TestSpoofingNoAddress(t *testing.T) {
 	// testSendTo(t, s, remoteAddr, ep, nil)
 }
 
-func verifyRoute(gotRoute, wantRoute stack.Route) error {
+func verifyRoute(gotRoute, wantRoute *stack.Route) error {
 	if gotRoute.LocalAddress != wantRoute.LocalAddress {
 		return fmt.Errorf("bad local address: got %s, want = %s", gotRoute.LocalAddress, wantRoute.LocalAddress)
 	}
 	if gotRoute.RemoteAddress != wantRoute.RemoteAddress {
 		return fmt.Errorf("bad remote address: got %s, want = %s", gotRoute.RemoteAddress, wantRoute.RemoteAddress)
 	}
-	if gotRoute.RemoteLinkAddress != wantRoute.RemoteLinkAddress {
-		return fmt.Errorf("bad remote link address: got %s, want = %s", gotRoute.RemoteLinkAddress, wantRoute.RemoteLinkAddress)
+	if got, want := gotRoute.RemoteLinkAddress(), wantRoute.RemoteLinkAddress(); got != want {
+		return fmt.Errorf("bad remote link address: got %s, want = %s", got, want)
 	}
 	if gotRoute.NextHop != wantRoute.NextHop {
 		return fmt.Errorf("bad next-hop address: got %s, want = %s", gotRoute.NextHop, wantRoute.NextHop)
@@ -1603,7 +1602,7 @@ func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) {
 	if err != nil {
 		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
 	}
-	if err := verifyRoute(r, stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil {
+	if err := verifyRoute(r, &stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil {
 		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
@@ -1657,7 +1656,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	if err != nil {
 		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
 	}
-	if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
+	if err := verifyRoute(r, &stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
 		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
@@ -1667,7 +1666,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	if err != nil {
 		t.Fatalf("FindRoute(0, \"\", %s, %d) failed: %s", header.IPv4Broadcast, fakeNetNumber, err)
 	}
-	if err := verifyRoute(r, stack.Route{LocalAddress: nic2Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
+	if err := verifyRoute(r, &stack.Route{LocalAddress: nic2Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
 		t.Errorf("FindRoute(0, \"\", %s, %d) returned unexpected Route: %s)", header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
@@ -1683,7 +1682,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	if err != nil {
 		t.Fatalf("FindRoute(0, \"\", %s, %d) failed: %s", header.IPv4Broadcast, fakeNetNumber, err)
 	}
-	if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
+	if err := verifyRoute(r, &stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
 		t.Errorf("FindRoute(0, \"\", %s, %d) returned unexpected Route: %s)", header.IPv4Broadcast, fakeNetNumber, err)
 	}
 }
@@ -2407,9 +2406,9 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 			}
 			opts := stack.Options{
 				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-					AutoGenIPv6LinkLocal: test.autoGen,
-					NDPDisp:              &ndpDisp,
-					OpaqueIIDOpts:        test.iidOpts,
+					AutoGenLinkLocal: test.autoGen,
+					NDPDisp:          &ndpDisp,
+					OpaqueIIDOpts:    test.iidOpts,
 				})},
 			}
 
@@ -2502,8 +2501,8 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			opts := stack.Options{
 				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-					AutoGenIPv6LinkLocal: true,
-					OpaqueIIDOpts:        test.opaqueIIDOpts,
+					AutoGenLinkLocal: true,
+					OpaqueIIDOpts:    test.opaqueIIDOpts,
 				})},
 			}
 
@@ -2536,9 +2535,9 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 	ndpConfigs := ipv6.DefaultNDPConfigurations()
 	opts := stack.Options{
 		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
-			NDPConfigs:           ndpConfigs,
-			AutoGenIPv6LinkLocal: true,
-			NDPDisp:              &ndpDisp,
+			NDPConfigs:       ndpConfigs,
+			AutoGenLinkLocal: true,
+			NDPDisp:          &ndpDisp,
 		})},
 	}
 
@@ -3351,11 +3350,16 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 	remNetSubnetBcast := remNetSubnet.Broadcast()
 
 	tests := []struct {
-		name          string
-		nicAddr       tcpip.ProtocolAddress
-		routes        []tcpip.Route
-		remoteAddr    tcpip.Address
-		expectedRoute stack.Route
+		name                      string
+		nicAddr                   tcpip.ProtocolAddress
+		routes                    []tcpip.Route
+		remoteAddr                tcpip.Address
+		expectedLocalAddress      tcpip.Address
+		expectedRemoteAddress     tcpip.Address
+		expectedRemoteLinkAddress tcpip.LinkAddress
+		expectedNextHop           tcpip.Address
+		expectedNetProto          tcpip.NetworkProtocolNumber
+		expectedLoop              stack.PacketLooping
 	}{
 		// Broadcast to a locally attached subnet populates the broadcast MAC.
 		{
@@ -3370,14 +3374,12 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr: ipv4SubnetBcast,
-			expectedRoute: stack.Route{
-				LocalAddress:      ipv4Addr.Address,
-				RemoteAddress:     ipv4SubnetBcast,
-				RemoteLinkAddress: header.EthernetBroadcastAddress,
-				NetProto:          header.IPv4ProtocolNumber,
-				Loop:              stack.PacketOut | stack.PacketLoop,
-			},
+			remoteAddr:                ipv4SubnetBcast,
+			expectedLocalAddress:      ipv4Addr.Address,
+			expectedRemoteAddress:     ipv4SubnetBcast,
+			expectedRemoteLinkAddress: header.EthernetBroadcastAddress,
+			expectedNetProto:          header.IPv4ProtocolNumber,
+			expectedLoop:              stack.PacketOut | stack.PacketLoop,
 		},
 		// Broadcast to a locally attached /31 subnet does not populate the
 		// broadcast MAC.
@@ -3393,13 +3395,11 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr: ipv4Subnet31Bcast,
-			expectedRoute: stack.Route{
-				LocalAddress:  ipv4AddrPrefix31.Address,
-				RemoteAddress: ipv4Subnet31Bcast,
-				NetProto:      header.IPv4ProtocolNumber,
-				Loop:          stack.PacketOut,
-			},
+			remoteAddr:            ipv4Subnet31Bcast,
+			expectedLocalAddress:  ipv4AddrPrefix31.Address,
+			expectedRemoteAddress: ipv4Subnet31Bcast,
+			expectedNetProto:      header.IPv4ProtocolNumber,
+			expectedLoop:          stack.PacketOut,
 		},
 		// Broadcast to a locally attached /32 subnet does not populate the
 		// broadcast MAC.
@@ -3415,13 +3415,11 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr: ipv4Subnet32Bcast,
-			expectedRoute: stack.Route{
-				LocalAddress:  ipv4AddrPrefix32.Address,
-				RemoteAddress: ipv4Subnet32Bcast,
-				NetProto:      header.IPv4ProtocolNumber,
-				Loop:          stack.PacketOut,
-			},
+			remoteAddr:            ipv4Subnet32Bcast,
+			expectedLocalAddress:  ipv4AddrPrefix32.Address,
+			expectedRemoteAddress: ipv4Subnet32Bcast,
+			expectedNetProto:      header.IPv4ProtocolNumber,
+			expectedLoop:          stack.PacketOut,
 		},
 		// IPv6 has no notion of a broadcast.
 		{
@@ -3436,13 +3434,11 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr: ipv6SubnetBcast,
-			expectedRoute: stack.Route{
-				LocalAddress:  ipv6Addr.Address,
-				RemoteAddress: ipv6SubnetBcast,
-				NetProto:      header.IPv6ProtocolNumber,
-				Loop:          stack.PacketOut,
-			},
+			remoteAddr:            ipv6SubnetBcast,
+			expectedLocalAddress:  ipv6Addr.Address,
+			expectedRemoteAddress: ipv6SubnetBcast,
+			expectedNetProto:      header.IPv6ProtocolNumber,
+			expectedLoop:          stack.PacketOut,
 		},
 		// Broadcast to a remote subnet in the route table is send to the next-hop
 		// gateway.
@@ -3459,14 +3455,12 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr: remNetSubnetBcast,
-			expectedRoute: stack.Route{
-				LocalAddress:  ipv4Addr.Address,
-				RemoteAddress: remNetSubnetBcast,
-				NextHop:       ipv4Gateway,
-				NetProto:      header.IPv4ProtocolNumber,
-				Loop:          stack.PacketOut,
-			},
+			remoteAddr:            remNetSubnetBcast,
+			expectedLocalAddress:  ipv4Addr.Address,
+			expectedRemoteAddress: remNetSubnetBcast,
+			expectedNextHop:       ipv4Gateway,
+			expectedNetProto:      header.IPv4ProtocolNumber,
+			expectedLoop:          stack.PacketOut,
 		},
 		// Broadcast to an unknown subnet follows the default route. Note that this
 		// is essentially just routing an unknown destination IP, because w/o any
@@ -3484,14 +3478,12 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr: remNetSubnetBcast,
-			expectedRoute: stack.Route{
-				LocalAddress:  ipv4Addr.Address,
-				RemoteAddress: remNetSubnetBcast,
-				NextHop:       ipv4Gateway,
-				NetProto:      header.IPv4ProtocolNumber,
-				Loop:          stack.PacketOut,
-			},
+			remoteAddr:            remNetSubnetBcast,
+			expectedLocalAddress:  ipv4Addr.Address,
+			expectedRemoteAddress: remNetSubnetBcast,
+			expectedNextHop:       ipv4Gateway,
+			expectedNetProto:      header.IPv4ProtocolNumber,
+			expectedLoop:          stack.PacketOut,
 		},
 	}
 
@@ -3520,10 +3512,27 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 				t.Fatalf("got unexpected address length = %d bytes", l)
 			}
 
-			if r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, test.remoteAddr, netProto, false /* multicastLoop */); err != nil {
+			r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, test.remoteAddr, netProto, false /* multicastLoop */)
+			if err != nil {
 				t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, test.remoteAddr, netProto, err)
-			} else if diff := cmp.Diff(r, test.expectedRoute, cmpopts.IgnoreUnexported(r)); diff != "" {
-				t.Errorf("route mismatch (-want +got):\n%s", diff)
+			}
+			if r.LocalAddress != test.expectedLocalAddress {
+				t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, test.expectedLocalAddress)
+			}
+			if r.RemoteAddress != test.expectedRemoteAddress {
+				t.Errorf("got r.RemoteAddress = %s, want = %s", r.RemoteAddress, test.expectedRemoteAddress)
+			}
+			if got := r.RemoteLinkAddress(); got != test.expectedRemoteLinkAddress {
+				t.Errorf("got r.RemoteLinkAddress() = %s, want = %s", got, test.expectedRemoteLinkAddress)
+			}
+			if r.NextHop != test.expectedNextHop {
+				t.Errorf("got r.NextHop = %s, want = %s", r.NextHop, test.expectedNextHop)
+			}
+			if r.NetProto != test.expectedNetProto {
+				t.Errorf("got r.NetProto = %d, want = %d", r.NetProto, test.expectedNetProto)
+			}
+			if r.Loop != test.expectedLoop {
+				t.Errorf("got r.Loop = %x, want = %x", r.Loop, test.expectedLoop)
 			}
 		})
 	}
@@ -4091,10 +4100,12 @@ func TestFindRouteWithForwarding(t *testing.T) {
 			s.SetRouteTable([]tcpip.Route{{Destination: test.netCfg.remoteAddr.WithPrefix().Subnet(), NIC: nicID2}})
 
 			r, err := s.FindRoute(test.addrNIC, test.localAddr, test.netCfg.remoteAddr, test.netCfg.proto, false /* multicastLoop */)
+			if r != nil {
+				defer r.Release()
+			}
 			if err != test.findRouteErr {
 				t.Fatalf("FindRoute(%d, %s, %s, %d, false) = %s, want = %s", test.addrNIC, test.localAddr, test.netCfg.remoteAddr, test.netCfg.proto, err, test.findRouteErr)
 			}
-			defer r.Release()
 
 			if test.findRouteErr != nil {
 				return
@@ -4152,3 +4163,63 @@ func TestFindRouteWithForwarding(t *testing.T) {
 		})
 	}
 }
+
+func TestWritePacketToRemote(t *testing.T) {
+	const nicID = 1
+	const MTU = 1280
+	e := channel.New(1, MTU, linkAddr1)
+	s := stack.New(stack.Options{})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("CreateNIC(%d) = %s", nicID, err)
+	}
+	tests := []struct {
+		name     string
+		protocol tcpip.NetworkProtocolNumber
+		payload  []byte
+	}{
+		{
+			name:     "SuccessIPv4",
+			protocol: header.IPv4ProtocolNumber,
+			payload:  []byte{1, 2, 3, 4},
+		},
+		{
+			name:     "SuccessIPv6",
+			protocol: header.IPv6ProtocolNumber,
+			payload:  []byte{5, 6, 7, 8},
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if err := s.WritePacketToRemote(nicID, linkAddr2, test.protocol, buffer.View(test.payload).ToVectorisedView()); err != nil {
+				t.Fatalf("s.WritePacketToRemote(_, _, _, _) = %s", err)
+			}
+
+			pkt, ok := e.Read()
+			if got, want := ok, true; got != want {
+				t.Fatalf("e.Read() = %t, want %t", got, want)
+			}
+			if got, want := pkt.Proto, test.protocol; got != want {
+				t.Fatalf("pkt.Proto = %d, want %d", got, want)
+			}
+			if got, want := pkt.Route.RemoteLinkAddress(), linkAddr2; got != want {
+				t.Fatalf("pkt.Route.RemoteAddress = %s, want %s", got, want)
+			}
+			if diff := cmp.Diff(pkt.Pkt.Data.ToView(), buffer.View(test.payload)); diff != "" {
+				t.Errorf("pkt.Pkt.Data mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+
+	t.Run("InvalidNICID", func(t *testing.T) {
+		if got, want := s.WritePacketToRemote(234, linkAddr2, header.IPv4ProtocolNumber, buffer.View([]byte{1}).ToVectorisedView()), tcpip.ErrUnknownDevice; got != want {
+			t.Fatalf("s.WritePacketToRemote(_, _, _, _) = %s, want = %s", got, want)
+		}
+		pkt, ok := e.Read()
+		if got, want := ok, false; got != want {
+			t.Fatalf("e.Read() = %t, %v; want %t", got, pkt, want)
+		}
+	})
+}
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 41a8e5ad0..2cdb5ca79 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -307,9 +307,7 @@ func TestBindToDeviceDistribution(t *testing.T) {
 						}(ep)
 
 						defer ep.Close()
-						if err := ep.SetSockOptBool(tcpip.ReusePortOption, endpoint.reuse); err != nil {
-							t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err)
-						}
+						ep.SocketOptions().SetReusePort(endpoint.reuse)
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
 						if err := ep.SetSockOpt(&bindToDeviceOption); err != nil {
 							t.Fatalf("SetSockOpt(&%T(%d)) on endpoint %d failed: %s", bindToDeviceOption, bindToDeviceOption, i, err)
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 5b9043d85..d9769e47d 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -38,14 +38,15 @@ const (
 // use it.
 type fakeTransportEndpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	proto    *fakeTransportProtocol
 	peerAddr tcpip.Address
-	route    stack.Route
+	route    *stack.Route
 	uniqueID uint64
 
 	// acceptQueue is non-nil iff bound.
-	acceptQueue []fakeTransportEndpoint
+	acceptQueue []*fakeTransportEndpoint
 
 	// ops is used to set and get socket options.
 	ops tcpip.SocketOptions
@@ -64,8 +65,11 @@ func (*fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
 func (f *fakeTransportEndpoint) SocketOptions() *tcpip.SocketOptions {
 	return &f.ops
 }
+
 func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
-	return &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
+	ep := &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
+	ep.ops.InitHandler(ep)
+	return ep
 }
 
 func (f *fakeTransportEndpoint) Abort() {
@@ -114,21 +118,11 @@ func (*fakeTransportEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Erro
 	return tcpip.ErrInvalidEndpointState
 }
 
-// SetSockOptBool sets a socket option. Currently not supported.
-func (*fakeTransportEndpoint) SetSockOptBool(tcpip.SockOptBool, bool) *tcpip.Error {
-	return tcpip.ErrInvalidEndpointState
-}
-
 // SetSockOptInt sets a socket option. Currently not supported.
 func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOptInt, int) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (*fakeTransportEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	return -1, tcpip.ErrUnknownProtocolOption
@@ -189,7 +183,7 @@ func (f *fakeTransportEndpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *wai
 	if len(f.acceptQueue) == 0 {
 		return nil, nil, nil
 	}
-	a := &f.acceptQueue[0]
+	a := f.acceptQueue[0]
 	f.acceptQueue = f.acceptQueue[1:]
 	return a, nil, nil
 }
@@ -206,7 +200,7 @@ func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error {
 	); err != nil {
 		return err
 	}
-	f.acceptQueue = []fakeTransportEndpoint{}
+	f.acceptQueue = []*fakeTransportEndpoint{}
 	return nil
 }
 
@@ -232,7 +226,7 @@ func (f *fakeTransportEndpoint) HandlePacket(id stack.TransportEndpointID, pkt *
 	}
 	route.ResolveWith(pkt.SourceLinkAddress())
 
-	f.acceptQueue = append(f.acceptQueue, fakeTransportEndpoint{
+	ep := &fakeTransportEndpoint{
 		TransportEndpointInfo: stack.TransportEndpointInfo{
 			ID:       f.ID,
 			NetProto: f.NetProto,
@@ -240,7 +234,9 @@ func (f *fakeTransportEndpoint) HandlePacket(id stack.TransportEndpointID, pkt *
 		proto:    f.proto,
 		peerAddr: route.RemoteAddress,
 		route:    route,
-	})
+	}
+	ep.ops.InitHandler(ep)
+	f.acceptQueue = append(f.acceptQueue, ep)
 }
 
 func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, *stack.PacketBuffer) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f9e83dd1c..5c9225b5d 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -247,6 +247,16 @@ func (a Address) WithPrefix() AddressWithPrefix {
 	}
 }
 
+// Unspecified returns true if the address is unspecified.
+func (a Address) Unspecified() bool {
+	for _, b := range a {
+		if b != 0 {
+			return false
+		}
+	}
+	return true
+}
+
 // AddressMask is a bitmask for an address.
 type AddressMask string
 
@@ -593,10 +603,6 @@ type Endpoint interface {
 	// SetSockOpt sets a socket option.
 	SetSockOpt(opt SettableSocketOption) *Error
 
-	// SetSockOptBool sets a socket option, for simple cases where a value
-	// has the bool type.
-	SetSockOptBool(opt SockOptBool, v bool) *Error
-
 	// SetSockOptInt sets a socket option, for simple cases where a value
 	// has the int type.
 	SetSockOptInt(opt SockOptInt, v int) *Error
@@ -604,10 +610,6 @@ type Endpoint interface {
 	// GetSockOpt gets a socket option.
 	GetSockOpt(opt GettableSocketOption) *Error
 
-	// GetSockOptBool gets a socket option for simple cases where a return
-	// value has the bool type.
-	GetSockOptBool(SockOptBool) (bool, *Error)
-
 	// GetSockOptInt gets a socket option for simple cases where a return
 	// value has the int type.
 	GetSockOptInt(SockOptInt) (int, *Error)
@@ -694,79 +696,6 @@ type WriteOptions struct {
 	Atomic bool
 }
 
-// SockOptBool represents socket options which values have the bool type.
-type SockOptBool int
-
-const (
-	// CorkOption is used by SetSockOptBool/GetSockOptBool to specify if
-	// data should be held until segments are full by the TCP transport
-	// protocol.
-	CorkOption SockOptBool = iota
-
-	// DelayOption is used by SetSockOptBool/GetSockOptBool to specify if
-	// data should be sent out immediately by the transport protocol. For
-	// TCP, it determines if the Nagle algorithm is on or off.
-	DelayOption
-
-	// KeepaliveEnabledOption is used by SetSockOptBool/GetSockOptBool to
-	// specify whether TCP keepalive is enabled for this socket.
-	KeepaliveEnabledOption
-
-	// MulticastLoopOption is used by SetSockOptBool/GetSockOptBool to
-	// specify whether multicast packets sent over a non-loopback interface
-	// will be looped back.
-	MulticastLoopOption
-
-	// NoChecksumOption is used by SetSockOptBool/GetSockOptBool to specify
-	// whether UDP checksum is disabled for this socket.
-	NoChecksumOption
-
-	// PasscredOption is used by SetSockOptBool/GetSockOptBool to specify
-	// whether SCM_CREDENTIALS socket control messages are enabled.
-	//
-	// Only supported on Unix sockets.
-	PasscredOption
-
-	// QuickAckOption is stubbed out in SetSockOptBool/GetSockOptBool.
-	QuickAckOption
-
-	// ReceiveTClassOption is used by SetSockOptBool/GetSockOptBool to
-	// specify if the IPV6_TCLASS ancillary message is passed with incoming
-	// packets.
-	ReceiveTClassOption
-
-	// ReceiveTOSOption is used by SetSockOptBool/GetSockOptBool to specify
-	// if the TOS ancillary message is passed with incoming packets.
-	ReceiveTOSOption
-
-	// ReceiveIPPacketInfoOption is used by SetSockOptBool/GetSockOptBool to
-	// specify if more inforamtion is provided with incoming packets such as
-	// interface index and address.
-	ReceiveIPPacketInfoOption
-
-	// ReuseAddressOption is used by SetSockOptBool/GetSockOptBool to
-	// specify whether Bind() should allow reuse of local address.
-	ReuseAddressOption
-
-	// ReusePortOption is used by SetSockOptBool/GetSockOptBool to permit
-	// multiple sockets to be bound to an identical socket address.
-	ReusePortOption
-
-	// V6OnlyOption is used by SetSockOptBool/GetSockOptBool to specify
-	// whether an IPv6 socket is to be restricted to sending and receiving
-	// IPv6 packets only.
-	V6OnlyOption
-
-	// IPHdrIncludedOption is used by SetSockOpt to indicate for a raw
-	// endpoint that all packets being written have an IP header and the
-	// endpoint should not attach an IP header.
-	IPHdrIncludedOption
-
-	// AcceptConnOption is used by GetSockOptBool to indicate if the
-	// socket is a listening socket.
-	AcceptConnOption
-)
-
 // SockOptInt represents socket options which values have the int type.
 type SockOptInt int
 
@@ -1389,6 +1318,18 @@ type ICMPv6PacketStats struct {
 	// RedirectMsg is the total number of ICMPv6 redirect message packets
 	// counted.
 	RedirectMsg *StatCounter
+
+	// MulticastListenerQuery is the total number of Multicast Listener Query
+	// messages counted.
+	MulticastListenerQuery *StatCounter
+
+	// MulticastListenerReport is the total number of Multicast Listener Report
+	// messages counted.
+	MulticastListenerReport *StatCounter
+
+	// MulticastListenerDone is the total number of Multicast Listener Done
+	// messages counted.
+	MulticastListenerDone *StatCounter
 }
 
 // ICMPv4SentPacketStats collects outbound ICMPv4-specific stats.
@@ -1430,6 +1371,10 @@ type ICMPv6SentPacketStats struct {
 type ICMPv6ReceivedPacketStats struct {
 	ICMPv6PacketStats
 
+	// Unrecognized is the total number of ICMPv6 packets received that the
+	// transport layer does not know how to parse.
+	Unrecognized *StatCounter
+
 	// Invalid is the total number of ICMPv6 packets received that the
 	// transport layer could not parse.
 	Invalid *StatCounter
@@ -1439,25 +1384,91 @@ type ICMPv6ReceivedPacketStats struct {
 	RouterOnlyPacketsDroppedByHost *StatCounter
 }
 
-// ICMPStats collects ICMP-specific stats (both v4 and v6).
-type ICMPStats struct {
+// ICMPv4Stats collects ICMPv4-specific stats.
+type ICMPv4Stats struct {
 	// ICMPv4SentPacketStats contains counts of sent packets by ICMPv4 packet type
 	// and a single count of packets which failed to write to the link
 	// layer.
-	V4PacketsSent ICMPv4SentPacketStats
+	PacketsSent ICMPv4SentPacketStats
 
 	// ICMPv4ReceivedPacketStats contains counts of received packets by ICMPv4
 	// packet type and a single count of invalid packets received.
-	V4PacketsReceived ICMPv4ReceivedPacketStats
+	PacketsReceived ICMPv4ReceivedPacketStats
+}
 
+// ICMPv6Stats collects ICMPv6-specific stats.
+type ICMPv6Stats struct {
 	// ICMPv6SentPacketStats contains counts of sent packets by ICMPv6 packet type
 	// and a single count of packets which failed to write to the link
 	// layer.
-	V6PacketsSent ICMPv6SentPacketStats
+	PacketsSent ICMPv6SentPacketStats
 
 	// ICMPv6ReceivedPacketStats contains counts of received packets by ICMPv6
 	// packet type and a single count of invalid packets received.
-	V6PacketsReceived ICMPv6ReceivedPacketStats
+	PacketsReceived ICMPv6ReceivedPacketStats
+}
+
+// ICMPStats collects ICMP-specific stats (both v4 and v6).
+type ICMPStats struct {
+	// V4 contains the ICMPv4-specifics stats.
+	V4 ICMPv4Stats
+
+	// V6 contains the ICMPv4-specifics stats.
+	V6 ICMPv6Stats
+}
+
+// IGMPPacketStats enumerates counts for all IGMP packet types.
+type IGMPPacketStats struct {
+	// MembershipQuery is the total number of Membership Query messages counted.
+	MembershipQuery *StatCounter
+
+	// V1MembershipReport is the total number of Version 1 Membership Report
+	// messages counted.
+	V1MembershipReport *StatCounter
+
+	// V2MembershipReport is the total number of Version 2 Membership Report
+	// messages counted.
+	V2MembershipReport *StatCounter
+
+	// LeaveGroup is the total number of Leave Group messages counted.
+	LeaveGroup *StatCounter
+}
+
+// IGMPSentPacketStats collects outbound IGMP-specific stats.
+type IGMPSentPacketStats struct {
+	IGMPPacketStats
+
+	// Dropped is the total number of IGMP packets dropped due to link layer
+	// errors.
+	Dropped *StatCounter
+}
+
+// IGMPReceivedPacketStats collects inbound IGMP-specific stats.
+type IGMPReceivedPacketStats struct {
+	IGMPPacketStats
+
+	// Invalid is the total number of IGMP packets received that IGMP could not
+	// parse.
+	Invalid *StatCounter
+
+	// ChecksumErrors is the total number of IGMP packets dropped due to bad
+	// checksums.
+	ChecksumErrors *StatCounter
+
+	// Unrecognized is the total number of unrecognized messages counted, these
+	// are silently ignored for forward-compatibilty.
+	Unrecognized *StatCounter
+}
+
+// IGMPStats colelcts IGMP-specific stats.
+type IGMPStats struct {
+	// IGMPSentPacketStats contains counts of sent packets by IGMP packet type
+	// and a single count of invalid packets received.
+	PacketsSent IGMPSentPacketStats
+
+	// IGMPReceivedPacketStats contains counts of received packets by IGMP packet
+	// type and a single count of invalid packets received.
+	PacketsReceived IGMPReceivedPacketStats
 }
 
 // IPStats collects IP-specific stats (both v4 and v6).
@@ -1665,6 +1676,9 @@ type Stats struct {
 	// ICMP breaks out ICMP-specific stats (both v4 and v6).
 	ICMP ICMPStats
 
+	// IGMP breaks out IGMP-specific stats.
+	IGMP IGMPStats
+
 	// IP breaks out IP-specific stats (both v4 and v6).
 	IP IPStats
 
diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go
index 1c8e2bc34..c461da137 100644
--- a/pkg/tcpip/tcpip_test.go
+++ b/pkg/tcpip/tcpip_test.go
@@ -226,3 +226,47 @@ func TestAddressWithPrefixSubnet(t *testing.T) {
 		}
 	}
 }
+
+func TestAddressUnspecified(t *testing.T) {
+	tests := []struct {
+		addr        Address
+		unspecified bool
+	}{
+		{
+			addr:        "",
+			unspecified: true,
+		},
+		{
+			addr:        "\x00",
+			unspecified: true,
+		},
+		{
+			addr:        "\x01",
+			unspecified: false,
+		},
+		{
+			addr:        "\x00\x00",
+			unspecified: true,
+		},
+		{
+			addr:        "\x01\x00",
+			unspecified: false,
+		},
+		{
+			addr:        "\x00\x01",
+			unspecified: false,
+		},
+		{
+			addr:        "\x01\x01",
+			unspecified: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(fmt.Sprintf("addr=%s", test.addr), func(t *testing.T) {
+			if got := test.addr.Unspecified(); got != test.unspecified {
+				t.Fatalf("got addr.Unspecified() = %t, want = %t", got, test.unspecified)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD
index 9b0f3b675..800025fb9 100644
--- a/pkg/tcpip/tests/integration/BUILD
+++ b/pkg/tcpip/tests/integration/BUILD
@@ -25,6 +25,7 @@ go_test(
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
         "@com_github_google_go_cmp//cmp:go_default_library",
diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go
index 421da1add..baaa741cd 100644
--- a/pkg/tcpip/tests/integration/loopback_test.go
+++ b/pkg/tcpip/tests/integration/loopback_test.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -70,8 +71,8 @@ func TestInitialLoopbackAddresses(t *testing.T) {
 
 	s := stack.New(stack.Options{
 		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocolWithOptions(ipv6.Options{
-			NDPDisp:              &ndpDispatcher{},
-			AutoGenIPv6LinkLocal: true,
+			NDPDisp:          &ndpDispatcher{},
+			AutoGenLinkLocal: true,
 			OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: func(nicID tcpip.NICID, nicName string) string {
 					t.Fatalf("should not attempt to get name for NIC with ID = %d; nicName = %s", nicID, nicName)
@@ -93,9 +94,10 @@ func TestInitialLoopbackAddresses(t *testing.T) {
 	}
 }
 
-// TestLoopbackAcceptAllInSubnet tests that a loopback interface considers
-// itself bound to all addresses in the subnet of an assigned address.
-func TestLoopbackAcceptAllInSubnet(t *testing.T) {
+// TestLoopbackAcceptAllInSubnetUDP tests that a loopback interface considers
+// itself bound to all addresses in the subnet of an assigned address and UDP
+// traffic is sent/received correctly.
+func TestLoopbackAcceptAllInSubnetUDP(t *testing.T) {
 	const (
 		nicID     = 1
 		localPort = 80
@@ -107,7 +109,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) {
 		Protocol:          header.IPv4ProtocolNumber,
 		AddressWithPrefix: ipv4Addr,
 	}
-	ipv4Bytes := []byte(ipv4Addr.Address)
+	ipv4Bytes := []byte(ipv4ProtocolAddress.AddressWithPrefix.Address)
 	ipv4Bytes[len(ipv4Bytes)-1]++
 	otherIPv4Address := tcpip.Address(ipv4Bytes)
 
@@ -129,7 +131,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) {
 		{
 			name:       "IPv4 bind to wildcard and send to assigned address",
 			addAddress: ipv4ProtocolAddress,
-			dstAddr:    ipv4Addr.Address,
+			dstAddr:    ipv4ProtocolAddress.AddressWithPrefix.Address,
 			expectRx:   true,
 		},
 		{
@@ -148,7 +150,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) {
 			name:       "IPv4 bind to other subnet-local address and send to assigned address",
 			addAddress: ipv4ProtocolAddress,
 			bindAddr:   otherIPv4Address,
-			dstAddr:    ipv4Addr.Address,
+			dstAddr:    ipv4ProtocolAddress.AddressWithPrefix.Address,
 			expectRx:   false,
 		},
 		{
@@ -161,7 +163,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) {
 		{
 			name:       "IPv4 bind to assigned address and send to other subnet-local address",
 			addAddress: ipv4ProtocolAddress,
-			bindAddr:   ipv4Addr.Address,
+			bindAddr:   ipv4ProtocolAddress.AddressWithPrefix.Address,
 			dstAddr:    otherIPv4Address,
 			expectRx:   false,
 		},
@@ -236,13 +238,17 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) {
 				t.Fatalf("got sep.Write(_, _) = (%d, _, nil), want = (%d, _, nil)", n, want)
 			}
 
-			if gotPayload, _, err := rep.Read(nil); test.expectRx {
+			var addr tcpip.FullAddress
+			if gotPayload, _, err := rep.Read(&addr); test.expectRx {
 				if err != nil {
-					t.Fatalf("reep.Read(nil): %s", err)
+					t.Fatalf("reep.Read(_): %s", err)
 				}
 				if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" {
 					t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
 				}
+				if addr.Addr != test.addAddress.AddressWithPrefix.Address {
+					t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, test.addAddress.AddressWithPrefix.Address)
+				}
 			} else {
 				if err != tcpip.ErrWouldBlock {
 					t.Fatalf("got rep.Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
@@ -312,3 +318,168 @@ func TestLoopbackSubnetLifetimeBoundToAddr(t *testing.T) {
 		t.Fatalf("got r.WritePacket(nil, %#v, _) = %s, want = %s", params, err, tcpip.ErrInvalidEndpointState)
 	}
 }
+
+// TestLoopbackAcceptAllInSubnetTCP tests that a loopback interface considers
+// itself bound to all addresses in the subnet of an assigned address and TCP
+// traffic is sent/received correctly.
+func TestLoopbackAcceptAllInSubnetTCP(t *testing.T) {
+	const (
+		nicID     = 1
+		localPort = 80
+	)
+
+	ipv4ProtocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv4ProtocolNumber,
+		AddressWithPrefix: ipv4Addr,
+	}
+	ipv4ProtocolAddress.AddressWithPrefix.PrefixLen = 8
+	ipv4Bytes := []byte(ipv4ProtocolAddress.AddressWithPrefix.Address)
+	ipv4Bytes[len(ipv4Bytes)-1]++
+	otherIPv4Address := tcpip.Address(ipv4Bytes)
+
+	ipv6ProtocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: ipv6Addr,
+	}
+	ipv6Bytes := []byte(ipv6Addr.Address)
+	ipv6Bytes[len(ipv6Bytes)-1]++
+	otherIPv6Address := tcpip.Address(ipv6Bytes)
+
+	tests := []struct {
+		name         string
+		addAddress   tcpip.ProtocolAddress
+		bindAddr     tcpip.Address
+		dstAddr      tcpip.Address
+		expectAccept bool
+	}{
+		{
+			name:         "IPv4 bind to wildcard and send to assigned address",
+			addAddress:   ipv4ProtocolAddress,
+			dstAddr:      ipv4ProtocolAddress.AddressWithPrefix.Address,
+			expectAccept: true,
+		},
+		{
+			name:         "IPv4 bind to wildcard and send to other subnet-local address",
+			addAddress:   ipv4ProtocolAddress,
+			dstAddr:      otherIPv4Address,
+			expectAccept: true,
+		},
+		{
+			name:         "IPv4 bind to wildcard send to other address",
+			addAddress:   ipv4ProtocolAddress,
+			dstAddr:      remoteIPv4Addr,
+			expectAccept: false,
+		},
+		{
+			name:         "IPv4 bind to other subnet-local address and send to assigned address",
+			addAddress:   ipv4ProtocolAddress,
+			bindAddr:     otherIPv4Address,
+			dstAddr:      ipv4ProtocolAddress.AddressWithPrefix.Address,
+			expectAccept: false,
+		},
+		{
+			name:         "IPv4 bind and send to other subnet-local address",
+			addAddress:   ipv4ProtocolAddress,
+			bindAddr:     otherIPv4Address,
+			dstAddr:      otherIPv4Address,
+			expectAccept: true,
+		},
+		{
+			name:         "IPv4 bind to assigned address and send to other subnet-local address",
+			addAddress:   ipv4ProtocolAddress,
+			bindAddr:     ipv4ProtocolAddress.AddressWithPrefix.Address,
+			dstAddr:      otherIPv4Address,
+			expectAccept: false,
+		},
+
+		{
+			name:         "IPv6 bind and send to assigned address",
+			addAddress:   ipv6ProtocolAddress,
+			bindAddr:     ipv6Addr.Address,
+			dstAddr:      ipv6Addr.Address,
+			expectAccept: true,
+		},
+		{
+			name:         "IPv6 bind to wildcard and send to other subnet-local address",
+			addAddress:   ipv6ProtocolAddress,
+			dstAddr:      otherIPv6Address,
+			expectAccept: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
+			})
+			if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
+			if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil {
+				t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, test.addAddress, err)
+			}
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: header.IPv4EmptySubnet,
+					NIC:         nicID,
+				},
+				tcpip.Route{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			var wq waiter.Queue
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			listeningEndpoint, err := s.NewEndpoint(tcp.ProtocolNumber, test.addAddress.Protocol, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err)
+			}
+			defer listeningEndpoint.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort}
+			if err := listeningEndpoint.Bind(bindAddr); err != nil {
+				t.Fatalf("listeningEndpoint.Bind(%#v): %s", bindAddr, err)
+			}
+
+			if err := listeningEndpoint.Listen(1); err != nil {
+				t.Fatalf("listeningEndpoint.Listen(1): %s", err)
+			}
+
+			connectingEndpoint, err := s.NewEndpoint(tcp.ProtocolNumber, test.addAddress.Protocol, &wq)
+			if err != nil {
+				t.Fatalf("s.NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err)
+			}
+			defer connectingEndpoint.Close()
+
+			connectAddr := tcpip.FullAddress{
+				Addr: test.dstAddr,
+				Port: localPort,
+			}
+			if err := connectingEndpoint.Connect(connectAddr); err != tcpip.ErrConnectStarted {
+				t.Fatalf("connectingEndpoint.Connect(%#v): %s", connectAddr, err)
+			}
+
+			if !test.expectAccept {
+				if _, _, err := listeningEndpoint.Accept(nil); err != tcpip.ErrWouldBlock {
+					t.Fatalf("got listeningEndpoint.Accept(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+				}
+				return
+			}
+
+			// Wait for the listening endpoint to be "readable". That is, wait for a
+			// new connection.
+			<-ch
+			var addr tcpip.FullAddress
+			if _, _, err := listeningEndpoint.Accept(&addr); err != nil {
+				t.Fatalf("listeningEndpoint.Accept(nil): %s", err)
+			}
+			if addr.Addr != test.addAddress.AddressWithPrefix.Address {
+				t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, test.addAddress.AddressWithPrefix.Address)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
index 9d30329f5..8be791a00 100644
--- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go
+++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
@@ -510,10 +510,7 @@ func TestReuseAddrAndBroadcast(t *testing.T) {
 					}
 					defer ep.Close()
 
-					if err := ep.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-						t.Fatalf("eps[%d].SetSockOptBool(tcpip.ReuseAddressOption, true): %s", len(eps), err)
-					}
-
+					ep.SocketOptions().SetReuseAddress(true)
 					ep.SocketOptions().SetBroadcast(true)
 
 					bindAddr := tcpip.FullAddress{Port: localPort}
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 440cb0352..94fcd72d9 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -49,6 +49,7 @@ const (
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	// The following fields are initialized at creation time and are
 	// immutable.
@@ -71,7 +72,7 @@ type endpoint struct {
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 	state         endpointState
-	route         stack.Route `state:"manual"`
+	route         *stack.Route `state:"manual"`
 	ttl           uint8
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
 	// linger is used for SO_LINGER socket option.
@@ -85,7 +86,7 @@ type endpoint struct {
 }
 
 func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return &endpoint{
+	ep := &endpoint{
 		stack: s,
 		TransportEndpointInfo: stack.TransportEndpointInfo{
 			NetProto:   netProto,
@@ -96,7 +97,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		sndBufSize:    32 * 1024,
 		state:         stateInitial,
 		uniqueID:      s.UniqueID(),
-	}, nil
+	}
+	ep.ops.InitHandler(ep)
+	return ep, nil
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -129,7 +132,10 @@ func (e *endpoint) Close() {
 	}
 	e.rcvMu.Unlock()
 
-	e.route.Release()
+	if e.route != nil {
+		e.route.Release()
+		e.route = nil
+	}
 
 	// Update the state.
 	e.state = stateClosed
@@ -142,6 +148,7 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
@@ -267,26 +274,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 	}
 
-	var route *stack.Route
-	if to == nil {
-		route = &e.route
-
-		if route.IsResolutionRequired() {
-			// Promote lock to exclusive if using a shared route,
-			// given that it may need to change in Route.Resolve()
-			// call below.
-			e.mu.RUnlock()
-			defer e.mu.RLock()
-
-			e.mu.Lock()
-			defer e.mu.Unlock()
-
-			// Recheck state after lock was re-acquired.
-			if e.state != stateConnected {
-				return 0, nil, tcpip.ErrInvalidEndpointState
-			}
-		}
-	} else {
+	route := e.route
+	if to != nil {
 		// Reject destination address if it goes through a different
 		// NIC than the endpoint was bound to.
 		nicID := to.NIC
@@ -310,7 +299,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 		defer r.Release()
 
-		route = &r
+		route = r
 	}
 
 	if route.IsResolutionRequired() {
@@ -361,11 +350,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	return nil
 }
 
-// SetSockOptBool sets a socket option. Currently not supported.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	return nil
-}
-
 // SetSockOptInt sets a socket option. Currently not supported.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
@@ -378,17 +362,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
-		return false, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -857,6 +830,7 @@ func (*endpoint) LastError() *tcpip.Error {
 	return nil
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 3bff3755a..3666bac0f 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -60,6 +60,8 @@ type packet struct {
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
+
 	// The following fields are initialized at creation time and are
 	// immutable.
 	stack       *stack.Stack `state:"manual"`
@@ -107,6 +109,7 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSize:    32 * 1024,
 	}
+	ep.ops.InitHandler(ep)
 
 	// Override with stack defaults.
 	var ss stack.SendBufferSizeOption
@@ -318,11 +321,6 @@ func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	}
 }
 
-// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
-func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
-}
-
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
@@ -390,16 +388,6 @@ func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	}
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.AcceptConnOption:
-		return false, nil
-	default:
-		return false, tcpip.ErrNotSupported
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -551,8 +539,10 @@ func (ep *endpoint) Stats() tcpip.EndpointStats {
 	return &ep.stats
 }
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {}
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (ep *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &ep.ops
 }
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 4ae1f92ab..0840a4b3d 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -58,12 +58,13 @@ type rawPacket struct {
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
+
 	// The following fields are initialized at creation time and are
 	// immutable.
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
 	associated  bool
-	hdrIncluded bool
 
 	// The following fields are used to manage the receive queue and are
 	// protected by rcvMu.
@@ -82,7 +83,7 @@ type endpoint struct {
 	bound         bool
 	// route is the route to a remote network endpoint. It is set via
 	// Connect(), and is valid only when conneted is true.
-	route stack.Route                  `state:"manual"`
+	route *stack.Route                 `state:"manual"`
 	stats tcpip.TransportEndpointStats `state:"nosave"`
 	// linger is used for SO_LINGER socket option.
 	linger tcpip.LingerOption
@@ -114,8 +115,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSizeMax: 32 * 1024,
 		associated:    associated,
-		hdrIncluded:   !associated,
 	}
+	e.ops.InitHandler(e)
+	e.ops.SetHeaderIncluded(!associated)
 
 	// Override with stack defaults.
 	var ss stack.SendBufferSizeOption
@@ -170,9 +172,11 @@ func (e *endpoint) Close() {
 		e.rcvList.Remove(e.rcvList.Front())
 	}
 
-	if e.connected {
+	e.connected = false
+
+	if e.route != nil {
 		e.route.Release()
-		e.connected = false
+		e.route = nil
 	}
 
 	e.closed = true
@@ -266,7 +270,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 	// If this is an unassociated socket and callee provided a nonzero
 	// destination address, route using that address.
-	if e.hdrIncluded {
+	if e.ops.GetHeaderIncluded() {
 		ip := header.IPv4(payloadBytes)
 		if !ip.IsValid(len(payloadBytes)) {
 			e.mu.RUnlock()
@@ -296,7 +300,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 
 		if e.route.IsResolutionRequired() {
-			savedRoute := &e.route
+			savedRoute := e.route
 			// Promote lock to exclusive if using a shared route,
 			// given that it may need to change in finishWrite.
 			e.mu.RUnlock()
@@ -304,7 +308,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 			// Make sure that the route didn't change during the
 			// time we didn't hold the lock.
-			if !e.connected || savedRoute != &e.route {
+			if !e.connected || savedRoute != e.route {
 				e.mu.Unlock()
 				return 0, nil, tcpip.ErrInvalidEndpointState
 			}
@@ -314,7 +318,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return n, ch, err
 		}
 
-		n, ch, err := e.finishWrite(payloadBytes, &e.route)
+		n, ch, err := e.finishWrite(payloadBytes, e.route)
 		e.mu.RUnlock()
 		return n, ch, err
 	}
@@ -335,7 +339,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		return 0, nil, err
 	}
 
-	n, ch, err := e.finishWrite(payloadBytes, &route)
+	n, ch, err := e.finishWrite(payloadBytes, route)
 	route.Release()
 	e.mu.RUnlock()
 	return n, ch, err
@@ -356,7 +360,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 		}
 	}
 
-	if e.hdrIncluded {
+	if e.ops.GetHeaderIncluded() {
 		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: buffer.View(payloadBytes).ToVectorisedView(),
 		})
@@ -531,18 +535,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	}
 }
 
-// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-	case tcpip.IPHdrIncludedOption:
-		e.mu.Lock()
-		e.hdrIncluded = v
-		e.mu.Unlock()
-		return nil
-	}
-	return tcpip.ErrUnknownProtocolOption
-}
-
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
@@ -601,23 +593,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	}
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
-		return false, nil
-
-	case tcpip.IPHdrIncludedOption:
-		e.mu.Lock()
-		v := e.hdrIncluded
-		e.mu.Unlock()
-		return v, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -756,10 +731,12 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 // Wait implements stack.TransportEndpoint.Wait.
 func (*endpoint) Wait() {}
 
+// LastError implements tcpip.Endpoint.LastError.
 func (*endpoint) LastError() *tcpip.Error {
 	return nil
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go
index 7d97cbdc7..4a7e1c039 100644
--- a/pkg/tcpip/transport/raw/endpoint_state.go
+++ b/pkg/tcpip/transport/raw/endpoint_state.go
@@ -73,7 +73,13 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	// If the endpoint is connected, re-connect.
 	if e.connected {
 		var err *tcpip.Error
-		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.route.RemoteAddress, e.NetProto, false)
+		// TODO(gvisor.dev/issue/4906): Properly restore the route with the right
+		// remote address. We used to pass e.remote.RemoteAddress which was
+		// effectively the empty address but since moving e.route to hold a pointer
+		// to a route instead of the route by value, we pass the empty address
+		// directly. Obviously this was always wrong since we should provide the
+		// remote address we were connected to, to properly restore the route.
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, "", e.NetProto, false)
 		if err != nil {
 			panic(err)
 		}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 518449602..cf232b508 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "more_shards")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -45,7 +45,9 @@ go_library(
         "rcv.go",
         "rcv_state.go",
         "reno.go",
+        "reno_recovery.go",
         "sack.go",
+        "sack_recovery.go",
         "sack_scoreboard.go",
         "segment.go",
         "segment_heap.go",
@@ -91,7 +93,7 @@ go_test(
         "tcp_test.go",
         "tcp_timestamp_test.go",
     ],
-    shard_count = 10,
+    shard_count = more_shards,
     deps = [
         ":tcp",
         "//pkg/rand",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 6e5adc383..3e1041cbe 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -213,7 +213,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	route.ResolveWith(s.remoteLinkAddr)
 
 	n := newEndpoint(l.stack, netProto, queue)
-	n.v6only = l.v6Only
+	n.ops.SetV6Only(l.v6Only)
 	n.ID = s.id
 	n.boundNICID = s.nicID
 	n.route = route
@@ -599,7 +599,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er
 				ack:    s.sequenceNumber + 1,
 				rcvWnd: ctx.rcvWnd,
 			}
-			if err := e.sendSynTCP(&route, fields, synOpts); err != nil {
+			if err := e.sendSynTCP(route, fields, synOpts); err != nil {
 				return err
 			}
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
@@ -752,7 +752,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er
 // its own goroutine and is responsible for handling connection requests.
 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) {
 	e.mu.Lock()
-	v6Only := e.v6only
+	v6Only := e.ops.GetV6Only()
 	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)
 
 	defer func() {
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ac6d879a7..c944dccc0 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"encoding/binary"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/rand"
@@ -133,7 +134,7 @@ func FindWndScale(wnd seqnum.Size) int {
 		return 0
 	}
 
-	max := seqnum.Size(0xffff)
+	max := seqnum.Size(math.MaxUint16)
 	s := 0
 	for wnd > max && s < header.MaxWndScale {
 		s++
@@ -300,7 +301,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if ttl == 0 {
 		ttl = h.ep.route.DefaultTTL()
 	}
-	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+	h.ep.sendSynTCP(h.ep.route, tcpFields{
 		id:     h.ep.ID,
 		ttl:    ttl,
 		tos:    h.ep.sendTOS,
@@ -361,7 +362,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
-		h.ep.sendSynTCP(&h.ep.route, tcpFields{
+		h.ep.sendSynTCP(h.ep.route, tcpFields{
 			id:     h.ep.ID,
 			ttl:    h.ep.ttl,
 			tos:    h.ep.sendTOS,
@@ -496,7 +497,7 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.LastError()
+				return h.ep.lastErrorLocked()
 			}
 		}
 
@@ -547,7 +548,7 @@ func (h *handshake) start() *tcpip.Error {
 	}
 
 	h.sendSYNOpts = synOpts
-	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+	h.ep.sendSynTCP(h.ep.route, tcpFields{
 		id:     h.ep.ID,
 		ttl:    h.ep.ttl,
 		tos:    h.ep.sendTOS,
@@ -575,7 +576,6 @@ func (h *handshake) complete() *tcpip.Error {
 		return err
 	}
 	defer timer.stop()
-
 	for h.state != handshakeCompleted {
 		// Unlock before blocking, and reacquire again afterwards (h.ep.mu is held
 		// throughout handshake processing).
@@ -597,7 +597,7 @@ func (h *handshake) complete() *tcpip.Error {
 			// the connection with another ACK or data (as ACKs are never
 			// retransmitted on their own).
 			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
-				h.ep.sendSynTCP(&h.ep.route, tcpFields{
+				h.ep.sendSynTCP(h.ep.route, tcpFields{
 					id:     h.ep.ID,
 					ttl:    h.ep.ttl,
 					tos:    h.ep.sendTOS,
@@ -631,9 +631,8 @@ func (h *handshake) complete() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.LastError()
+				return h.ep.lastErrorLocked()
 			}
-
 		case wakerForNewSegment:
 			if err := h.processSegments(); err != nil {
 				return err
@@ -820,8 +819,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 	data = data.Clone(nil)
 
 	optLen := len(tf.opts)
-	if tf.rcvWnd > 0xffff {
-		tf.rcvWnd = 0xffff
+	if tf.rcvWnd > math.MaxUint16 {
+		tf.rcvWnd = math.MaxUint16
 	}
 
 	mss := int(gso.MSS)
@@ -865,8 +864,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 // network endpoint and under the provided identity.
 func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
 	optLen := len(tf.opts)
-	if tf.rcvWnd > 0xffff {
-		tf.rcvWnd = 0xffff
+	if tf.rcvWnd > math.MaxUint16 {
+		tf.rcvWnd = math.MaxUint16
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
@@ -941,7 +940,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
-	err := e.sendTCP(&e.route, tcpFields{
+	err := e.sendTCP(e.route, tcpFields{
 		id:     e.ID,
 		ttl:    e.ttl,
 		tos:    e.sendTOS,
@@ -1002,7 +1001,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	// Only send a reset if the connection is being aborted for a reason
 	// other than receiving a reset.
 	e.setEndpointState(StateError)
-	e.HardError = err
+	e.hardError = err
 	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
 		// one used by Linux. We need to handle the case of window being shrunk
@@ -1080,7 +1079,7 @@ func (e *endpoint) transitionToStateCloseLocked() {
 // to any other listening endpoint. We reply with RST if we cannot find one.
 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, s.nicID)
-	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
+	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
 		// Dual-stack socket, try IPv4.
 		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, s.nicID)
 	}
@@ -1141,7 +1140,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		//  delete the TCB, and return.
 		case StateCloseWait:
 			e.transitionToStateCloseLocked()
-			e.HardError = tcpip.ErrAborted
+			e.hardError = tcpip.ErrAborted
 			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
@@ -1286,7 +1285,7 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
 	userTimeout := e.userTimeout
 
 	e.keepalive.Lock()
-	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
+	if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() {
 		e.keepalive.Unlock()
 		return nil
 	}
@@ -1323,7 +1322,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	}
 	// Start the keepalive timer IFF it's enabled and there is no pending
 	// data to send.
-	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
+	if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
 		e.keepalive.timer.disable()
 		e.keepalive.Unlock()
 		return
@@ -1353,7 +1352,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 
 	epilogue := func() {
 		// e.mu is expected to be hold upon entering this section.
-
 		if e.snd != nil {
 			e.snd.resendTimer.cleanup()
 		}
@@ -1383,7 +1381,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 			e.lastErrorMu.Unlock()
 
 			e.setEndpointState(StateError)
-			e.HardError = err
+			e.hardError = err
 
 			e.workerCleanup = true
 			// Lock released below.
@@ -1638,7 +1636,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()
 		}
 		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
 		if newSyn {
-			info := e.EndpointInfo.TransportEndpointInfo
+			info := e.TransportEndpointInfo
 			newID := info.ID
 			newID.RemoteAddress = ""
 			newID.RemotePort = 0
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index a6f25896b..1d1b01a6c 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -405,14 +405,6 @@ func testV4Accept(t *testing.T, c *context.Context) {
 		}
 	}
 
-	// Make sure we get the same error when calling the original ep and the
-	// new one. This validates that v4-mapped endpoints are still able to
-	// query the V6Only flag, whereas pure v4 endpoints are not.
-	_, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption)
-	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected {
-		t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected)
-	}
-
 	// Check the peer address.
 	addr, err := nep.GetRemoteAddress()
 	if err != nil {
@@ -530,12 +522,12 @@ func TestV6AcceptOnV6(t *testing.T) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 	var addr tcpip.FullAddress
-	nep, _, err := c.EP.Accept(&addr)
+	_, _, err := c.EP.Accept(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept(&addr)
+			_, _, err = c.EP.Accept(&addr)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
@@ -548,12 +540,6 @@ func TestV6AcceptOnV6(t *testing.T) {
 	if addr.Addr != context.TestV6Addr {
 		t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr)
 	}
-
-	// Make sure we can still query the v6 only status of the new endpoint,
-	// that is, that it is in fact a v6 socket.
-	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
-		t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err)
-	}
 }
 
 func TestV4AcceptOnV4(t *testing.T) {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4f4f4c65e..87eda2efb 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -310,16 +310,12 @@ type Stats struct {
 func (*Stats) IsEndpointStats() {}
 
 // EndpointInfo holds useful information about a transport endpoint which
-// can be queried by monitoring tools.
+// can be queried by monitoring tools. This exists to allow tcp-only state to
+// be exposed.
 //
 // +stateify savable
 type EndpointInfo struct {
 	stack.TransportEndpointInfo
-
-	// HardError is meaningful only when state is stateError. It stores the
-	// error to be returned when read/write syscalls are called and the
-	// endpoint is in this state. HardError is protected by endpoint mu.
-	HardError *tcpip.Error `state:".(string)"`
 }
 
 // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
@@ -367,6 +363,7 @@ func (*EndpointInfo) IsEndpointInfo() {}
 // +stateify savable
 type endpoint struct {
 	EndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	// endpointEntry is used to queue endpoints for processing to the
 	// a given tcp processor goroutine.
@@ -386,6 +383,11 @@ type endpoint struct {
 	waiterQueue *waiter.Queue `state:"wait"`
 	uniqueID    uint64
 
+	// hardError is meaningful only when state is stateError. It stores the
+	// error to be returned when read/write syscalls are called and the
+	// endpoint is in this state. hardError is protected by endpoint mu.
+	hardError *tcpip.Error `state:".(string)"`
+
 	// lastError represents the last error that the endpoint reported;
 	// access to it is protected by the following mutex.
 	lastErrorMu sync.Mutex   `state:"nosave"`
@@ -421,7 +423,10 @@ type endpoint struct {
 
 	// mu protects all endpoint fields unless documented otherwise. mu must
 	// be acquired before interacting with the endpoint fields.
-	mu          sync.Mutex `state:"nosave"`
+	//
+	// During handshake, mu is locked by the protocol listen goroutine and
+	// released by the handshake completion goroutine.
+	mu          sync.CrossGoroutineMutex `state:"nosave"`
 	ownedByUser uint32
 
 	// state must be read/set using the EndpointState()/setEndpointState()
@@ -436,9 +441,8 @@ type endpoint struct {
 	isPortReserved    bool `state:"manual"`
 	isRegistered      bool `state:"manual"`
 	boundNICID        tcpip.NICID
-	route             stack.Route `state:"manual"`
+	route             *stack.Route `state:"manual"`
 	ttl               uint8
-	v6only            bool
 	isConnectNotified bool
 
 	// h stores a reference to the current handshake state if the endpoint is in
@@ -506,24 +510,9 @@ type endpoint struct {
 	// delay is a boolean (0 is false) and must be accessed atomically.
 	delay uint32
 
-	// cork holds back segments until full.
-	//
-	// cork is a boolean (0 is false) and must be accessed atomically.
-	cork uint32
-
 	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
 	scoreboard *SACKScoreboard
 
-	// The options below aren't implemented, but we remember the user
-	// settings because applications expect to be able to set/query these
-	// options.
-
-	// slowAck holds the negated state of quick ack. It is stubbed out and
-	// does nothing.
-	//
-	// slowAck is a boolean (0 is false) and must be accessed atomically.
-	slowAck uint32
-
 	// segmentQueue is used to hand received segments to the protocol
 	// goroutine. Segments are queued as long as the queue is not full,
 	// and dropped when it is.
@@ -701,7 +690,7 @@ func (e *endpoint) UniqueID() uint64 {
 //
 // If userMSS is non-zero and is not greater than the maximum possible MSS for
 // r, it will be used; otherwise, the maximum possible MSS will be used.
-func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
+func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
 	// The maximum possible MSS is dependent on the route.
 	// TODO(b/143359391): Respect TCP Min and Max size.
 	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)
@@ -850,7 +839,6 @@ func (e *endpoint) recentTimestamp() uint32 {
 // +stateify savable
 type keepalive struct {
 	sync.Mutex `state:"nosave"`
-	enabled    bool
 	idle       time.Duration
 	interval   time.Duration
 	count      int
@@ -884,6 +872,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		windowClamp:   DefaultReceiveBufferSize,
 		maxSynRetries: DefaultSynRetries,
 	}
+	e.ops.InitHandler(e)
+	e.ops.SetMulticastLoop(true)
+	e.ops.SetQuickAck(true)
 
 	var ss tcpip.TCPSendBufferSizeRangeOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
@@ -907,7 +898,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 
 	var de tcpip.TCPDelayEnabled
 	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
-		e.SetSockOptBool(tcpip.DelayOption, true)
+		e.ops.SetDelayOption(true)
 	}
 
 	var tcpLT tcpip.TCPLingerTimeoutOption
@@ -1169,7 +1160,11 @@ func (e *endpoint) cleanupLocked() {
 	e.boundPortFlags = ports.Flags{}
 	e.boundDest = tcpip.FullAddress{}
 
-	e.route.Release()
+	if e.route != nil {
+		e.route.Release()
+		e.route = nil
+	}
+
 	e.stack.CompleteTransportEndpointCleanup(e)
 	tcpip.DeleteDanglingEndpoint(e)
 }
@@ -1279,11 +1274,20 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvListMu.Unlock()
 }
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-func (e *endpoint) LastError() *tcpip.Error {
+// Preconditions: e.mu must be held to call this function.
+func (e *endpoint) hardErrorLocked() *tcpip.Error {
+	err := e.hardError
+	e.hardError = nil
+	return err
+}
+
+// Preconditions: e.mu must be held to call this function.
+func (e *endpoint) lastErrorLocked() *tcpip.Error {
 	e.lastErrorMu.Lock()
 	defer e.lastErrorMu.Unlock()
 	err := e.lastError
@@ -1291,6 +1295,16 @@ func (e *endpoint) LastError() *tcpip.Error {
 	return err
 }
 
+// LastError implements tcpip.Endpoint.LastError.
+func (e *endpoint) LastError() *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+	if err := e.hardErrorLocked(); err != nil {
+		return err
+	}
+	return e.lastErrorLocked()
+}
+
 // Read reads data from the endpoint.
 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
 	e.LockUser()
@@ -1312,9 +1326,11 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	bufUsed := e.rcvBufUsed
 	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
-		he := e.HardError
 		if s == StateError {
-			return buffer.View{}, tcpip.ControlMessages{}, he
+			if err := e.hardErrorLocked(); err != nil {
+				return buffer.View{}, tcpip.ControlMessages{}, err
+			}
+			return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
 		}
 		e.stats.ReadErrors.NotConnected.Increment()
 		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
@@ -1370,9 +1386,13 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 // indicating the reason why it's not writable.
 // Caller must hold e.mu and e.sndBufMu
 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
+	// The endpoint cannot be written to if it's not connected.
 	switch s := e.EndpointState(); {
 	case s == StateError:
-		return 0, e.HardError
+		if err := e.hardErrorLocked(); err != nil {
+			return 0, err
+		}
+		return 0, tcpip.ErrClosedForSend
 	case !s.connecting() && !s.connected():
 		return 0, tcpip.ErrClosedForSend
 	case s.connecting():
@@ -1486,7 +1506,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	// but has some pending unread data.
 	if s := e.EndpointState(); !s.connected() && s != StateClose {
 		if s == StateError {
-			return 0, tcpip.ControlMessages{}, e.HardError
+			return 0, tcpip.ControlMessages{}, e.hardErrorLocked()
 		}
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
 		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
@@ -1602,72 +1622,39 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo
 	return false, false
 }
 
-// SetSockOptBool sets a socket option.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-
-	case tcpip.CorkOption:
-		e.LockUser()
-		if !v {
-			atomic.StoreUint32(&e.cork, 0)
-
-			// Handle the corked data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.cork, 1)
-		}
-		e.UnlockUser()
-
-	case tcpip.DelayOption:
-		if v {
-			atomic.StoreUint32(&e.delay, 1)
-		} else {
-			atomic.StoreUint32(&e.delay, 0)
-
-			// Handle delayed data.
-			e.sndWaker.Assert()
-		}
-
-	case tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		e.keepalive.enabled = v
-		e.keepalive.Unlock()
-		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-
-	case tcpip.QuickAckOption:
-		o := uint32(1)
-		if v {
-			o = 0
-		}
-		atomic.StoreUint32(&e.slowAck, o)
-
-	case tcpip.ReuseAddressOption:
-		e.LockUser()
-		e.portFlags.TupleOnly = v
-		e.UnlockUser()
-
-	case tcpip.ReusePortOption:
-		e.LockUser()
-		e.portFlags.LoadBalanced = v
-		e.UnlockUser()
+// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
+func (e *endpoint) OnReuseAddressSet(v bool) {
+	e.LockUser()
+	e.portFlags.TupleOnly = v
+	e.UnlockUser()
+}
 
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrInvalidEndpointState
-		}
+// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
+func (e *endpoint) OnReusePortSet(v bool) {
+	e.LockUser()
+	e.portFlags.LoadBalanced = v
+	e.UnlockUser()
+}
 
-		// We only allow this to be set when we're in the initial state.
-		if e.EndpointState() != StateInitial {
-			return tcpip.ErrInvalidEndpointState
-		}
+// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
+func (e *endpoint) OnKeepAliveSet(v bool) {
+	e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+}
 
-		e.LockUser()
-		e.v6only = v
-		e.UnlockUser()
+// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
+func (e *endpoint) OnDelayOptionSet(v bool) {
+	if !v {
+		// Handle delayed data.
+		e.sndWaker.Assert()
 	}
+}
 
-	return nil
+// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
+func (e *endpoint) OnCorkOptionSet(v bool) {
+	if !v {
+		// Handle the corked data.
+		e.sndWaker.Assert()
+	}
 }
 
 // SetSockOptInt sets a socket option.
@@ -1949,67 +1936,6 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 	return e.rcvBufUsed, nil
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-
-	case tcpip.CorkOption:
-		return atomic.LoadUint32(&e.cork) != 0, nil
-
-	case tcpip.DelayOption:
-		return atomic.LoadUint32(&e.delay) != 0, nil
-
-	case tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		v := e.keepalive.enabled
-		e.keepalive.Unlock()
-
-		return v, nil
-
-	case tcpip.QuickAckOption:
-		v := atomic.LoadUint32(&e.slowAck) == 0
-		return v, nil
-
-	case tcpip.ReuseAddressOption:
-		e.LockUser()
-		v := e.portFlags.TupleOnly
-		e.UnlockUser()
-
-		return v, nil
-
-	case tcpip.ReusePortOption:
-		e.LockUser()
-		v := e.portFlags.LoadBalanced
-		e.UnlockUser()
-
-		return v, nil
-
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrUnknownProtocolOption
-		}
-
-		e.LockUser()
-		v := e.v6only
-		e.UnlockUser()
-
-		return v, nil
-
-	case tcpip.MulticastLoopOption:
-		return true, nil
-
-	case tcpip.AcceptConnOption:
-		e.LockUser()
-		defer e.UnlockUser()
-
-		return e.EndpointState() == StateListen, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -2166,7 +2092,7 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 // checkV4MappedLocked determines the effective network protocol and converts
 // addr to its canonical form.
 func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
 	if err != nil {
 		return tcpip.FullAddress{}, 0, err
 	}
@@ -2243,7 +2169,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		return tcpip.ErrAlreadyConnecting
 
 	case StateError:
-		return e.HardError
+		if err := e.hardErrorLocked(); err != nil {
+			return err
+		}
+		return tcpip.ErrConnectionAborted
 
 	default:
 		return tcpip.ErrInvalidEndpointState
@@ -2417,7 +2346,7 @@ func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error {
 				e.lastErrorMu.Unlock()
 
 				e.setEndpointState(StateError)
-				e.HardError = err
+				e.hardError = err
 
 				// Call cleanupLocked to free up any reservations.
 				e.cleanupLocked()
@@ -2697,7 +2626,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// v6only set to false.
 	if netProto == header.IPv6ProtocolNumber {
 		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
-		alsoBindToV4 := !e.v6only && addr.Addr == "" && stackHasV4
+		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4
 		if alsoBindToV4 {
 			netProtos = append(netProtos, header.IPv4ProtocolNumber)
 		}
@@ -2782,7 +2711,7 @@ func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
 
 func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
 	// TCP HandlePacket is not required anymore as inbound packets first
-	// land at the Dispatcher which then can either delivery using the
+	// land at the Dispatcher which then can either deliver using the
 	// worker go routine or directly do the invoke the tcp processing inline
 	// based on the state of the endpoint.
 }
@@ -3161,7 +3090,7 @@ func (e *endpoint) State() uint32 {
 func (e *endpoint) Info() tcpip.EndpointInfo {
 	e.LockUser()
 	// Make a copy of the endpoint info.
-	ret := e.EndpointInfo
+	ret := e.TransportEndpointInfo
 	e.UnlockUser()
 	return &ret
 }
@@ -3187,6 +3116,7 @@ func (e *endpoint) Wait() {
 	}
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index bb901c0f8..ba67176b5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -321,21 +321,21 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) {
 }
 
 // saveHardError is invoked by stateify.
-func (e *EndpointInfo) saveHardError() string {
-	if e.HardError == nil {
+func (e *endpoint) saveHardError() string {
+	if e.hardError == nil {
 		return ""
 	}
 
-	return e.HardError.String()
+	return e.hardError.String()
 }
 
 // loadHardError is invoked by stateify.
-func (e *EndpointInfo) loadHardError(s string) {
+func (e *endpoint) loadHardError(s string) {
 	if s == "" {
 		return
 	}
 
-	e.HardError = tcpip.StringToError(s)
+	e.hardError = tcpip.StringToError(s)
 }
 
 // saveMeasureTime is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 2329aca4b..672159eed 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -250,7 +250,7 @@ func replyWithReset(stack *stack.Stack, s *segment, tos, ttl uint8) *tcpip.Error
 		ttl = route.DefaultTTL()
 	}
 
-	return sendTCP(&route, tcpFields{
+	return sendTCP(route, tcpFields{
 		id:     s.id,
 		ttl:    ttl,
 		tos:    tos,
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 8e0b7c843..f2b1b68da 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"container/heap"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -48,6 +49,10 @@ type receiver struct {
 
 	rcvWndScale uint8
 
+	// prevBufused is the snapshot of endpoint rcvBufUsed taken when we
+	// advertise a receive window.
+	prevBufUsed int
+
 	closed bool
 
 	// pendingRcvdSegments is bounded by the receive buffer size of the
@@ -80,9 +85,9 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
 	// outgoing packets, we should use what we have advertised for acceptability
 	// test.
 	scaledWindowSize := r.rcvWnd >> r.rcvWndScale
-	if scaledWindowSize > 0xffff {
+	if scaledWindowSize > math.MaxUint16 {
 		// This is what we actually put in the Window field.
-		scaledWindowSize = 0xffff
+		scaledWindowSize = math.MaxUint16
 	}
 	advertisedWindowSize := scaledWindowSize << r.rcvWndScale
 	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
@@ -106,6 +111,34 @@ func (r *receiver) currentWindow() (curWnd seqnum.Size) {
 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	newWnd := r.ep.selectWindow()
 	curWnd := r.currentWindow()
+	unackLen := int(r.ep.snd.maxSentAck.Size(r.rcvNxt))
+	bufUsed := r.ep.receiveBufferUsed()
+
+	// Grow the right edge of the window only for payloads larger than the
+	// the segment overhead OR if the application is actively consuming data.
+	//
+	// Avoiding growing the right edge otherwise, addresses a situation below:
+	// An application has been slow in reading data and we have burst of
+	// incoming segments lengths < segment overhead. Here, our available free
+	// memory would reduce drastically when compared to the advertised receive
+	// window.
+	//
+	// For example: With incoming 512 bytes segments, segment overhead of
+	// 552 bytes (at the time of writing this comment), with receive window
+	// starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0
+	// when the curWnd is still 19436 bytes, because for every incoming segment
+	// newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1),
+	// while curWnd would reduce by 512 bytes.
+	// Such a situation causes us to keep tail dropping the incoming segments
+	// and never advertise zero receive window to the peer.
+	//
+	// Linux does a similar check for minimal sk_buff size (128):
+	// https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783
+	//
+	// Also, if the application is reading the data, we keep growing the right
+	// edge, as we are still advertising a window that we think can be serviced.
+	toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed
+
 	// Update rcvAcc only if new window is > previously advertised window. We
 	// should never shrink the acceptable sequence space once it has been
 	// advertised the peer. If we shrink the acceptable sequence space then we
@@ -115,7 +148,7 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	// rcvWUP       rcvNxt         rcvAcc          new rcvAcc
 	//               <=====curWnd ===>
 	//               <========= newWnd > curWnd ========= >
-	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) {
+	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) && toGrow {
 		// If the new window moves the right edge, then update rcvAcc.
 		r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd))
 	} else {
@@ -130,11 +163,24 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	// receiver's estimated RTT.
 	r.rcvWnd = newWnd
 	r.rcvWUP = r.rcvNxt
+	r.prevBufUsed = bufUsed
 	scaledWnd := r.rcvWnd >> r.rcvWndScale
 	if scaledWnd == 0 {
 		// Increment a metric if we are advertising an actual zero window.
 		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 	}
+
+	// If we started off with a window larger than what can he held in
+	// the 16bit window field, we ceil the value to the max value.
+	// While ceiling, we still do not want to grow the right edge when
+	// not applicable.
+	if scaledWnd > math.MaxUint16 {
+		if toGrow {
+			scaledWnd = seqnum.Size(math.MaxUint16)
+		} else {
+			scaledWnd = seqnum.Size(uint16(scaledWnd))
+		}
+	}
 	return r.rcvNxt, scaledWnd
 }
 
diff --git a/pkg/tcpip/transport/tcp/reno_recovery.go b/pkg/tcpip/transport/tcp/reno_recovery.go
new file mode 100644
index 000000000..2aa708e97
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/reno_recovery.go
@@ -0,0 +1,67 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+// renoRecovery stores the variables related to TCP Reno loss recovery
+// algorithm.
+//
+// +stateify savable
+type renoRecovery struct {
+	s *sender
+}
+
+func newRenoRecovery(s *sender) *renoRecovery {
+	return &renoRecovery{s: s}
+}
+
+func (rr *renoRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) {
+	ack := rcvdSeg.ackNumber
+	snd := rr.s
+
+	// We are in fast recovery mode. Ignore the ack if it's out of range.
+	if !ack.InRange(snd.sndUna, snd.sndNxt+1) {
+		return
+	}
+
+	// Don't count this as a duplicate if it is carrying data or
+	// updating the window.
+	if rcvdSeg.logicalLen() != 0 || snd.sndWnd != rcvdSeg.window {
+		return
+	}
+
+	// Inflate the congestion window if we're getting duplicate acks
+	// for the packet we retransmitted.
+	if !fastRetransmit && ack == snd.fr.first {
+		// We received a dup, inflate the congestion window by 1 packet
+		// if we're not at the max yet. Only inflate the window if
+		// regular FastRecovery is in use, RFC6675 does not require
+		// inflating cwnd on duplicate ACKs.
+		if snd.sndCwnd < snd.fr.maxCwnd {
+			snd.sndCwnd++
+		}
+		return
+	}
+
+	// A partial ack was received. Retransmit this packet and remember it
+	// so that we don't retransmit it again.
+	//
+	// We don't inflate the window because we're putting the same packet
+	// back onto the wire.
+	//
+	// N.B. The retransmit timer will be reset by the caller.
+	snd.fr.first = ack
+	snd.dupAckCount = 0
+	snd.resendSegment()
+}
diff --git a/pkg/tcpip/transport/tcp/sack_recovery.go b/pkg/tcpip/transport/tcp/sack_recovery.go
new file mode 100644
index 000000000..7e813fa96
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/sack_recovery.go
@@ -0,0 +1,120 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import "gvisor.dev/gvisor/pkg/tcpip/seqnum"
+
+// sackRecovery stores the variables related to TCP SACK loss recovery
+// algorithm.
+//
+// +stateify savable
+type sackRecovery struct {
+	s *sender
+}
+
+func newSACKRecovery(s *sender) *sackRecovery {
+	return &sackRecovery{s: s}
+}
+
+// handleSACKRecovery implements the loss recovery phase as described in RFC6675
+// section 5, step C.
+func (sr *sackRecovery) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
+	snd := sr.s
+	snd.SetPipe()
+
+	if smss := int(snd.ep.scoreboard.SMSS()); limit > smss {
+		// Cap segment size limit to s.smss as SACK recovery requires
+		// that all retransmissions or new segments send during recovery
+		// be of <= SMSS.
+		limit = smss
+	}
+
+	nextSegHint := snd.writeList.Front()
+	for snd.outstanding < snd.sndCwnd {
+		var nextSeg *segment
+		var rescueRtx bool
+		nextSeg, nextSegHint, rescueRtx = snd.NextSeg(nextSegHint)
+		if nextSeg == nil {
+			return dataSent
+		}
+		if !snd.isAssignedSequenceNumber(nextSeg) || snd.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
+			// New data being sent.
+
+			// Step C.3 described below is handled by
+			// maybeSendSegment which increments sndNxt when
+			// a segment is transmitted.
+			//
+			// Step C.3 "If any of the data octets sent in
+			// (C.1) are above HighData, HighData must be
+			// updated to reflect the transmission of
+			// previously unsent data."
+			//
+			// We pass s.smss as the limit as the Step 2) requires that
+			// new data sent should be of size s.smss or less.
+			if sent := snd.maybeSendSegment(nextSeg, limit, end); !sent {
+				return dataSent
+			}
+			dataSent = true
+			snd.outstanding++
+			snd.writeNext = nextSeg.Next()
+			continue
+		}
+
+		// Now handle the retransmission case where we matched either step 1,3 or 4
+		// of the NextSeg algorithm.
+		// RFC 6675, Step C.4.
+		//
+		// "The estimate of the amount of data outstanding in the network
+		// must be updated by incrementing pipe by the number of octets
+		// transmitted in (C.1)."
+		snd.outstanding++
+		dataSent = true
+		snd.sendSegment(nextSeg)
+
+		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
+		if rescueRtx {
+			// We do the last part of rule (4) of NextSeg here to update
+			// RescueRxt as until this point we don't know if we are going
+			// to use the rescue transmission.
+			snd.fr.rescueRxt = snd.fr.last
+		} else {
+			// RFC 6675, Step C.2
+			//
+			// "If any of the data octets sent in (C.1) are below
+			// HighData, HighRxt MUST be set to the highest sequence
+			// number of the retransmitted segment unless NextSeg ()
+			// rule (4) was invoked for this retransmission."
+			snd.fr.highRxt = segEnd - 1
+		}
+	}
+	return dataSent
+}
+
+func (sr *sackRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) {
+	snd := sr.s
+	if fastRetransmit {
+		snd.resendSegment()
+	}
+
+	// We are in fast recovery mode. Ignore the ack if it's out of range.
+	if ack := rcvdSeg.ackNumber; !ack.InRange(snd.sndUna, snd.sndNxt+1) {
+		return
+	}
+
+	// RFC 6675 recovery algorithm step C 1-5.
+	end := snd.sndUna.Add(snd.sndWnd)
+	dataSent := sr.handleSACKRecovery(snd.maxPayloadSize, end)
+	snd.postXmit(dataSent)
+}
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 2091989cc..5ef73ec74 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -204,7 +204,7 @@ func (s *segment) payloadSize() int {
 // segMemSize is the amount of memory used to hold the segment data and
 // the associated metadata.
 func (s *segment) segMemSize() int {
-	return segSize + s.data.Size()
+	return SegSize + s.data.Size()
 }
 
 // parse populates the sequence & ack numbers, flags, and window fields of the
diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go
index 0ab7b8f56..392ff0859 100644
--- a/pkg/tcpip/transport/tcp/segment_unsafe.go
+++ b/pkg/tcpip/transport/tcp/segment_unsafe.go
@@ -19,5 +19,6 @@ import (
 )
 
 const (
-	segSize = int(unsafe.Sizeof(segment{}))
+	// SegSize is the minimal size of the segment overhead.
+	SegSize = int(unsafe.Sizeof(segment{}))
 )
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 0e0fdf14c..baec762e1 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"math"
 	"sort"
-	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sleep"
@@ -92,6 +91,17 @@ type congestionControl interface {
 	PostRecovery()
 }
 
+// lossRecovery is an interface that must be implemented by any supported
+// loss recovery algorithm.
+type lossRecovery interface {
+	// DoRecovery is invoked when loss is detected and segments need
+	// to be retransmitted. The cumulative or selective ACK is passed along
+	// with the flag which identifies whether the connection entered fast
+	// retransmit with this ACK and to retransmit the first unacknowledged
+	// segment.
+	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
+}
+
 // sender holds the state necessary to send TCP segments.
 //
 // +stateify savable
@@ -108,6 +118,9 @@ type sender struct {
 	// fr holds state related to fast recovery.
 	fr fastRecovery
 
+	// lr is the loss recovery algorithm used by the sender.
+	lr lossRecovery
+
 	// sndCwnd is the congestion window, in packets.
 	sndCwnd int
 
@@ -276,6 +289,8 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 
 	s.cc = s.initCongestionControl(ep.cc)
 
+	s.lr = s.initLossRecovery()
+
 	// A negative sndWndScale means that no scaling is in use, otherwise we
 	// store the scaling value.
 	if sndWndScale > 0 {
@@ -330,6 +345,14 @@ func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionCon
 	}
 }
 
+// initLossRecovery initiates the loss recovery algorithm for the sender.
+func (s *sender) initLossRecovery() lossRecovery {
+	if s.ep.sackPermitted {
+		return newSACKRecovery(s)
+	}
+	return newRenoRecovery(s)
+}
+
 // updateMaxPayloadSize updates the maximum payload size based on the given
 // MTU. If this is in response to "packet too big" control packets (indicated
 // by the count argument), it also reduces the number of outstanding packets and
@@ -550,7 +573,7 @@ func (s *sender) retransmitTimerExpired() bool {
 		// We were attempting fast recovery but were not successful.
 		// Leave the state. We don't need to update ssthresh because it
 		// has already been updated when entered fast-recovery.
-		s.leaveFastRecovery()
+		s.leaveRecovery()
 	}
 
 	s.state = RTORecovery
@@ -789,7 +812,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			}
 			if !nextTooBig && seg.data.Size() < available {
 				// Segment is not full.
-				if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 {
+				if s.outstanding > 0 && s.ep.ops.GetDelayOption() {
 					// Nagle's algorithm. From Wikipedia:
 					//   Nagle's algorithm works by
 					//   combining a number of small
@@ -808,7 +831,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 				// send space and MSS.
 				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
 				// timeout.
-				if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 {
+				if seg.data.Size() < s.maxPayloadSize && s.ep.ops.GetCorkOption() {
 					return false
 				}
 			}
@@ -913,79 +936,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 	return true
 }
 
-// handleSACKRecovery implements the loss recovery phase as described in RFC6675
-// section 5, step C.
-func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
-	s.SetPipe()
-
-	if smss := int(s.ep.scoreboard.SMSS()); limit > smss {
-		// Cap segment size limit to s.smss as SACK recovery requires
-		// that all retransmissions or new segments send during recovery
-		// be of <= SMSS.
-		limit = smss
-	}
-
-	nextSegHint := s.writeList.Front()
-	for s.outstanding < s.sndCwnd {
-		var nextSeg *segment
-		var rescueRtx bool
-		nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint)
-		if nextSeg == nil {
-			return dataSent
-		}
-		if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
-			// New data being sent.
-
-			// Step C.3 described below is handled by
-			// maybeSendSegment which increments sndNxt when
-			// a segment is transmitted.
-			//
-			// Step C.3 "If any of the data octets sent in
-			// (C.1) are above HighData, HighData must be
-			// updated to reflect the transmission of
-			// previously unsent data."
-			//
-			// We pass s.smss as the limit as the Step 2) requires that
-			// new data sent should be of size s.smss or less.
-			if sent := s.maybeSendSegment(nextSeg, limit, end); !sent {
-				return dataSent
-			}
-			dataSent = true
-			s.outstanding++
-			s.writeNext = nextSeg.Next()
-			continue
-		}
-
-		// Now handle the retransmission case where we matched either step 1,3 or 4
-		// of the NextSeg algorithm.
-		// RFC 6675, Step C.4.
-		//
-		// "The estimate of the amount of data outstanding in the network
-		// must be updated by incrementing pipe by the number of octets
-		// transmitted in (C.1)."
-		s.outstanding++
-		dataSent = true
-		s.sendSegment(nextSeg)
-
-		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
-		if rescueRtx {
-			// We do the last part of rule (4) of NextSeg here to update
-			// RescueRxt as until this point we don't know if we are going
-			// to use the rescue transmission.
-			s.fr.rescueRxt = s.fr.last
-		} else {
-			// RFC 6675, Step C.2
-			//
-			// "If any of the data octets sent in (C.1) are below
-			// HighData, HighRxt MUST be set to the highest sequence
-			// number of the retransmitted segment unless NextSeg ()
-			// rule (4) was invoked for this retransmission."
-			s.fr.highRxt = segEnd - 1
-		}
-	}
-	return dataSent
-}
-
 func (s *sender) sendZeroWindowProbe() {
 	ack, win := s.ep.rcv.getSendParams()
 	s.unackZeroWindowProbes++
@@ -1014,6 +964,30 @@ func (s *sender) disableZeroWindowProbing() {
 	s.resendTimer.disable()
 }
 
+func (s *sender) postXmit(dataSent bool) {
+	if dataSent {
+		// We sent data, so we should stop the keepalive timer to ensure
+		// that no keepalives are sent while there is pending data.
+		s.ep.disableKeepaliveTimer()
+	}
+
+	// If the sender has advertized zero receive window and we have
+	// data to be sent out, start zero window probing to query the
+	// the remote for it's receive window size.
+	if s.writeNext != nil && s.sndWnd == 0 {
+		s.enableZeroWindowProbing()
+	}
+
+	// Enable the timer if we have pending data and it's not enabled yet.
+	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
+		s.resendTimer.enable(s.rto)
+	}
+	// If we have no more pending data, start the keepalive timer.
+	if s.sndUna == s.sndNxt {
+		s.ep.resetKeepaliveTimer(false)
+	}
+}
+
 // sendData sends new data segments. It is called when data becomes available or
 // when the send window opens up.
 func (s *sender) sendData() {
@@ -1034,55 +1008,29 @@ func (s *sender) sendData() {
 	}
 
 	var dataSent bool
-
-	// RFC 6675 recovery algorithm step C 1-5.
-	if s.fr.active && s.ep.sackPermitted {
-		dataSent = s.handleSACKRecovery(s.maxPayloadSize, end)
-	} else {
-		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
-			cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
-			if cwndLimit < limit {
-				limit = cwndLimit
-			}
-			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
-				// Move writeNext along so that we don't try and scan data that
-				// has already been SACKED.
-				s.writeNext = seg.Next()
-				continue
-			}
-			if sent := s.maybeSendSegment(seg, limit, end); !sent {
-				break
-			}
-			dataSent = true
-			s.outstanding += s.pCount(seg)
+	for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
+		cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
+		if cwndLimit < limit {
+			limit = cwndLimit
+		}
+		if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+			// Move writeNext along so that we don't try and scan data that
+			// has already been SACKED.
 			s.writeNext = seg.Next()
+			continue
 		}
+		if sent := s.maybeSendSegment(seg, limit, end); !sent {
+			break
+		}
+		dataSent = true
+		s.outstanding += s.pCount(seg)
+		s.writeNext = seg.Next()
 	}
 
-	if dataSent {
-		// We sent data, so we should stop the keepalive timer to ensure
-		// that no keepalives are sent while there is pending data.
-		s.ep.disableKeepaliveTimer()
-	}
-
-	// If the sender has advertized zero receive window and we have
-	// data to be sent out, start zero window probing to query the
-	// the remote for it's receive window size.
-	if s.writeNext != nil && s.sndWnd == 0 {
-		s.enableZeroWindowProbing()
-	}
-
-	// Enable the timer if we have pending data and it's not enabled yet.
-	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
-		s.resendTimer.enable(s.rto)
-	}
-	// If we have no more pending data, start the keepalive timer.
-	if s.sndUna == s.sndNxt {
-		s.ep.resetKeepaliveTimer(false)
-	}
+	s.postXmit(dataSent)
 }
 
-func (s *sender) enterFastRecovery() {
+func (s *sender) enterRecovery() {
 	s.fr.active = true
 	// Save state to reflect we're now in fast recovery.
 	//
@@ -1104,7 +1052,7 @@ func (s *sender) enterFastRecovery() {
 	s.ep.stack.Stats().TCP.FastRecovery.Increment()
 }
 
-func (s *sender) leaveFastRecovery() {
+func (s *sender) leaveRecovery() {
 	s.fr.active = false
 	s.fr.maxCwnd = 0
 	s.dupAckCount = 0
@@ -1115,57 +1063,6 @@ func (s *sender) leaveFastRecovery() {
 	s.cc.PostRecovery()
 }
 
-func (s *sender) handleFastRecovery(seg *segment) (rtx bool) {
-	ack := seg.ackNumber
-	// We are in fast recovery mode. Ignore the ack if it's out of
-	// range.
-	if !ack.InRange(s.sndUna, s.sndNxt+1) {
-		return false
-	}
-
-	// Leave fast recovery if it acknowledges all the data covered by
-	// this fast recovery session.
-	if s.fr.last.LessThan(ack) {
-		s.leaveFastRecovery()
-		return false
-	}
-
-	if s.ep.sackPermitted {
-		// When SACK is enabled we let retransmission be governed by
-		// the SACK logic.
-		return false
-	}
-
-	// Don't count this as a duplicate if it is carrying data or
-	// updating the window.
-	if seg.logicalLen() != 0 || s.sndWnd != seg.window {
-		return false
-	}
-
-	// Inflate the congestion window if we're getting duplicate acks
-	// for the packet we retransmitted.
-	if ack == s.fr.first {
-		// We received a dup, inflate the congestion window by 1 packet
-		// if we're not at the max yet. Only inflate the window if
-		// regular FastRecovery is in use, RFC6675 does not require
-		// inflating cwnd on duplicate ACKs.
-		if s.sndCwnd < s.fr.maxCwnd {
-			s.sndCwnd++
-		}
-		return false
-	}
-
-	// A partial ack was received. Retransmit this packet and
-	// remember it so that we don't retransmit it again. We don't
-	// inflate the window because we're putting the same packet back
-	// onto the wire.
-	//
-	// N.B. The retransmit timer will be reset by the caller.
-	s.fr.first = ack
-	s.dupAckCount = 0
-	return true
-}
-
 // isAssignedSequenceNumber relies on the fact that we only set flags once a
 // sequencenumber is assigned and that is only done right before we send the
 // segment. As a result any segment that has a non-zero flag has a valid
@@ -1228,14 +1125,11 @@ func (s *sender) SetPipe() {
 	s.outstanding = pipe
 }
 
-// checkDuplicateAck is called when an ack is received. It manages the state
-// related to duplicate acks and determines if a retransmit is needed according
-// to the rules in RFC 6582 (NewReno).
-func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
+// detectLoss is called when an ack is received and returns whether a loss is
+// detected. It manages the state related to duplicate acks and determines if
+// a retransmit is needed according to the rules in RFC 6582 (NewReno).
+func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
 	ack := seg.ackNumber
-	if s.fr.active {
-		return s.handleFastRecovery(seg)
-	}
 
 	// We're not in fast recovery yet. A segment is considered a duplicate
 	// only if it doesn't carry any data and doesn't update the send window,
@@ -1266,14 +1160,14 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
 	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
 	//
 	// We only do the check here, the incrementing of last to the highest
-	// sequence number transmitted till now is done when enterFastRecovery
+	// sequence number transmitted till now is done when enterRecovery
 	// is invoked.
 	if !s.fr.last.LessThan(seg.ackNumber) {
 		s.dupAckCount = 0
 		return false
 	}
 	s.cc.HandleNDupAcks()
-	s.enterFastRecovery()
+	s.enterRecovery()
 	s.dupAckCount = 0
 	return true
 }
@@ -1415,14 +1309,23 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 		s.SetPipe()
 	}
 
-	// Count the duplicates and do the fast retransmit if needed.
-	rtx := s.checkDuplicateAck(rcvdSeg)
+	ack := rcvdSeg.ackNumber
+	fastRetransmit := false
+	// Do not leave fast recovery, if the ACK is out of range.
+	if s.fr.active {
+		// Leave fast recovery if it acknowledges all the data covered by
+		// this fast recovery session.
+		if ack.InRange(s.sndUna, s.sndNxt+1) && s.fr.last.LessThan(ack) {
+			s.leaveRecovery()
+		}
+	} else {
+		// Detect loss by counting the duplicates and enter recovery.
+		fastRetransmit = s.detectLoss(rcvdSeg)
+	}
 
 	// Stash away the current window size.
 	s.sndWnd = rcvdSeg.window
 
-	ack := rcvdSeg.ackNumber
-
 	// Disable zero window probing if remote advertizes a non-zero receive
 	// window. This can be with an ACK to the zero window probe (where the
 	// acknumber refers to the already acknowledged byte) OR to any previously
@@ -1539,19 +1442,24 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			s.resendTimer.disable()
 		}
 	}
+
 	// Now that we've popped all acknowledged data from the retransmit
 	// queue, retransmit if needed.
-	if rtx {
-		s.resendSegment()
+	if s.fr.active {
+		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
+		// When SACK is enabled data sending is governed by steps in
+		// RFC 6675 Section 5 recovery steps  A-C.
+		// See: https://tools.ietf.org/html/rfc6675#section-5.
+		if s.ep.sackPermitted {
+			return
+		}
 	}
 
 	// Send more data now that some of the pending data has been ack'd, or
 	// that the window opened up, or the congestion window was inflated due
 	// to a duplicate ack during fast recovery. This will also re-enable
 	// the retransmit timer if needed.
-	if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo {
-		s.sendData()
-	}
+	s.sendData()
 }
 
 // sendSegment sends the specified segment.
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9f0fb41e3..1759ebea9 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -75,9 +75,6 @@ func TestGiveUpConnect(t *testing.T) {
 
 	// Wait for ep to become writable.
 	<-notifyCh
-	if err := ep.LastError(); err != tcpip.ErrAborted {
-		t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted)
-	}
 
 	// Call Connect again to retreive the handshake failure status
 	// and stats updates.
@@ -267,7 +264,7 @@ func TestTCPResetsSentNoICMP(t *testing.T) {
 	}
 
 	// Read outgoing ICMP stats and check no ICMP DstUnreachable was recorded.
-	sent := stats.ICMP.V4PacketsSent
+	sent := stats.ICMP.V4.PacketsSent
 	if got, want := sent.DstUnreachable.Value(), uint64(0); got != want {
 		t.Errorf("got ICMP DstUnreachable.Value() = %d, want = %d", got, want)
 	}
@@ -2532,10 +2529,10 @@ func TestSegmentMerging(t *testing.T) {
 		{
 			"cork",
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOptBool(tcpip.CorkOption, true)
+				ep.SocketOptions().SetCorkOption(true)
 			},
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOptBool(tcpip.CorkOption, false)
+				ep.SocketOptions().SetCorkOption(false)
 			},
 		},
 	}
@@ -2627,7 +2624,7 @@ func TestDelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptBool(tcpip.DelayOption, true)
+	c.EP.SocketOptions().SetDelayOption(true)
 
 	var allData []byte
 	for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
@@ -2675,7 +2672,7 @@ func TestUndelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptBool(tcpip.DelayOption, true)
+	c.EP.SocketOptions().SetDelayOption(true)
 
 	allData := [][]byte{{0}, {1, 2, 3}}
 	for i, data := range allData {
@@ -2708,7 +2705,7 @@ func TestUndelay(t *testing.T) {
 	// Check that we don't get the second packet yet.
 	c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond)
 
-	c.EP.SetSockOptBool(tcpip.DelayOption, false)
+	c.EP.SocketOptions().SetDelayOption(false)
 
 	// Check that data is received.
 	second := c.GetPacket()
@@ -2745,8 +2742,8 @@ func TestMSSNotDelayed(t *testing.T) {
 		fn   func(tcpip.Endpoint)
 	}{
 		{"no-op", func(tcpip.Endpoint) {}},
-		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.DelayOption, true) }},
-		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.CorkOption, true) }},
+		{"delay", func(ep tcpip.Endpoint) { ep.SocketOptions().SetDelayOption(true) }},
+		{"cork", func(ep tcpip.Endpoint) { ep.SocketOptions().SetCorkOption(true) }},
 	}
 
 	for _, test := range tests {
@@ -3198,6 +3195,11 @@ loop:
 		case tcpip.ErrWouldBlock:
 			select {
 			case <-ch:
+				// Expect the state to be StateError and subsequent Reads to fail with HardError.
+				if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
+					t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
+				}
+				break loop
 			case <-time.After(1 * time.Second):
 				t.Fatalf("Timed out waiting for reset to arrive")
 			}
@@ -3207,14 +3209,10 @@ loop:
 			t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
 		}
 	}
-	// Expect the state to be StateError and subsequent Reads to fail with HardError.
-	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
-		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
-	}
+
 	if tcp.EndpointState(c.EP.State()) != tcp.StateError {
 		t.Fatalf("got EP state is not StateError")
 	}
-
 	if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
 		t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got)
 	}
@@ -4193,9 +4191,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4205,9 +4201,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4218,9 +4212,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4233,9 +4225,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4246,9 +4236,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4261,9 +4249,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4656,13 +4642,9 @@ func TestConnectAvoidsBoundPorts(t *testing.T) {
 												switch network {
 												case "ipv4":
 												case "ipv6":
-													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-														t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err)
-													}
+													ep.SocketOptions().SetV6Only(true)
 												case "dual":
-													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
-														t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err)
-													}
+													ep.SocketOptions().SetV6Only(false)
 												default:
 													t.Fatalf("unknown network: '%s'", network)
 												}
@@ -4998,9 +4980,7 @@ func TestKeepalive(t *testing.T) {
 	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil {
 		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
-		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
-	}
+	c.EP.SocketOptions().SetKeepAlive(true)
 
 	// 5 unacked keepalives are sent. ACK each one, and check that the
 	// connection stays alive after 5.
@@ -6118,10 +6098,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// Introduce a 25ms latency by delaying the first byte.
 	latency := 25 * time.Millisecond
 	time.Sleep(latency)
-	rawEP.SendPacketWithTS([]byte{1}, tsVal)
+	// Send an initial payload with atleast segment overhead size. The receive
+	// window would not grow for smaller segments.
+	rawEP.SendPacketWithTS(make([]byte, tcp.SegSize), tsVal)
 
 	pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
 	rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize()
+
 	time.Sleep(25 * time.Millisecond)
 
 	// Allocate a large enough payload for the test.
@@ -6394,10 +6377,7 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.T
 	if err != nil {
 		t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %s", err)
 	}
-	gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption)
-	if err != nil {
-		t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err)
-	}
+	gotDelayOption := ep.SocketOptions().GetDelayOption()
 	if gotDelayOption != wantDelayOption {
 		t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", gotDelayOption, wantDelayOption)
 	}
@@ -7250,9 +7230,7 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil {
 		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
-		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
-	}
+	c.EP.SocketOptions().SetKeepAlive(true)
 
 	// Set userTimeout to be the duration to be 1 keepalive
 	// probes. Which means that after the first probe is sent
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index e6aa4fc4b..010a23e45 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -592,9 +592,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 		c.t.Fatalf("NewEndpoint failed: %v", err)
 	}
 
-	if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
-		c.t.Fatalf("SetSockOpt failed failed: %v", err)
-	}
+	c.EP.SocketOptions().SetV6Only(v6only)
 }
 
 // GetV6Packet reads a single packet from the link layer endpoint of the context
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index c78549424..7ebae63d8 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -56,6 +56,7 @@ go_test(
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 57976d4e3..5043e7aa5 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -16,8 +16,8 @@ package udp
 
 import (
 	"fmt"
+	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -77,6 +77,7 @@ func (s EndpointState) String() string {
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	// The following fields are initialized at creation time and do not
 	// change throughout the lifetime of the endpoint.
@@ -94,21 +95,20 @@ type endpoint struct {
 	rcvClosed     bool
 
 	// The following fields are protected by the mu mutex.
-	mu             sync.RWMutex `state:"nosave"`
-	sndBufSize     int
-	sndBufSizeMax  int
+	mu            sync.RWMutex `state:"nosave"`
+	sndBufSize    int
+	sndBufSizeMax int
+	// state must be read/set using the EndpointState()/setEndpointState()
+	// methods.
 	state          EndpointState
-	route          stack.Route `state:"manual"`
+	route          *stack.Route `state:"manual"`
 	dstPort        uint16
-	v6only         bool
 	ttl            uint8
 	multicastTTL   uint8
 	multicastAddr  tcpip.Address
 	multicastNICID tcpip.NICID
-	multicastLoop  bool
 	portFlags      ports.Flags
 	bindToDevice   tcpip.NICID
-	noChecksum     bool
 
 	lastErrorMu sync.Mutex   `state:"nosave"`
 	lastError   *tcpip.Error `state:".(string)"`
@@ -122,17 +122,6 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
-	// receiveTOS determines if the incoming IPv4 TOS header field is passed
-	// as ancillary data to ControlMessages on Read.
-	receiveTOS bool
-
-	// receiveTClass determines if the incoming IPv6 TClass header field is
-	// passed as ancillary data to ControlMessages on Read.
-	receiveTClass bool
-
-	// receiveIPPacketInfo determines if the packet info is returned by Read.
-	receiveIPPacketInfo bool
-
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -188,13 +177,14 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		//
 		// Linux defaults to TTL=1.
 		multicastTTL:         1,
-		multicastLoop:        true,
 		rcvBufSizeMax:        32 * 1024,
 		sndBufSizeMax:        32 * 1024,
 		multicastMemberships: make(map[multicastMembership]struct{}),
 		state:                StateInitial,
 		uniqueID:             s.UniqueID(),
 	}
+	e.ops.InitHandler(e)
+	e.ops.SetMulticastLoop(true)
 
 	// Override with stack defaults.
 	var ss stack.SendBufferSizeOption
@@ -210,6 +200,20 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 	return e
 }
 
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState() returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
 // UniqueID implements stack.TransportEndpoint.UniqueID.
 func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
@@ -235,7 +239,7 @@ func (e *endpoint) Close() {
 	e.mu.Lock()
 	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateBound, StateConnected:
 		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
 		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{})
@@ -258,10 +262,13 @@ func (e *endpoint) Close() {
 	}
 	e.rcvMu.Unlock()
 
-	e.route.Release()
+	if e.route != nil {
+		e.route.Release()
+		e.route = nil
+	}
 
 	// Update the state.
-	e.state = StateClosed
+	e.setEndpointState(StateClosed)
 
 	e.mu.Unlock()
 
@@ -303,21 +310,16 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		HasTimestamp: true,
 		Timestamp:    p.timestamp,
 	}
-	e.mu.RLock()
-	receiveTOS := e.receiveTOS
-	receiveTClass := e.receiveTClass
-	receiveIPPacketInfo := e.receiveIPPacketInfo
-	e.mu.RUnlock()
-	if receiveTOS {
+	if e.ops.GetReceiveTOS() {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-	if receiveTClass {
+	if e.ops.GetReceiveTClass() {
 		cm.HasTClass = true
 		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
 		cm.TClass = uint32(p.tos)
 	}
-	if receiveIPPacketInfo {
+	if e.ops.GetReceivePacketInfo() {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
 	}
@@ -330,7 +332,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 //
 // Returns true for retry if preparation should be retried.
 func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) {
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial:
 	case StateConnected:
 		return false, nil
@@ -352,7 +354,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 
 	// The state changed when we released the shared locked and re-acquired
 	// it in exclusive mode. Try again.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return true, nil
 	}
 
@@ -367,9 +369,9 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 // connectRoute establishes a route to the specified interface or the
 // configured multicast interface if no interface is specified and the
 // specified address is a multicast address.
-func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) {
+func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, *tcpip.Error) {
 	localAddr := e.ID.LocalAddress
-	if isBroadcastOrMulticast(localAddr) {
+	if e.isBroadcastOrMulticast(nicID, netProto, localAddr) {
 		// A packet can only originate from a unicast address (i.e., an interface).
 		localAddr = ""
 	}
@@ -384,9 +386,9 @@ func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netPr
 	}
 
 	// Find a route to the desired destination.
-	r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.multicastLoop)
+	r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop())
 	if err != nil {
-		return stack.Route{}, 0, err
+		return nil, 0, err
 	}
 	return r, nicID, nil
 }
@@ -429,7 +431,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	to := opts.To
 
 	e.mu.RLock()
-	defer e.mu.RUnlock()
+	lockReleased := false
+	defer func() {
+		if lockReleased {
+			return
+		}
+		e.mu.RUnlock()
+	}()
 
 	// If we've shutdown with SHUT_WR we are in an invalid state for sending.
 	if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
@@ -448,36 +456,9 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 	}
 
-	var route *stack.Route
-	var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error)
-	var dstPort uint16
-	if to == nil {
-		route = &e.route
-		dstPort = e.dstPort
-		resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) {
-			// Promote lock to exclusive if using a shared route, given that it may
-			// need to change in Route.Resolve() call below.
-			e.mu.RUnlock()
-			e.mu.Lock()
-
-			// Recheck state after lock was re-acquired.
-			if e.state != StateConnected {
-				err = tcpip.ErrInvalidEndpointState
-			}
-			if err == nil && route.IsResolutionRequired() {
-				ch, err = route.Resolve(waker)
-			}
-
-			e.mu.Unlock()
-			e.mu.RLock()
-
-			// Recheck state after lock was re-acquired.
-			if e.state != StateConnected {
-				err = tcpip.ErrInvalidEndpointState
-			}
-			return
-		}
-	} else {
+	route := e.route
+	dstPort := e.dstPort
+	if to != nil {
 		// Reject destination address if it goes through a different
 		// NIC than the endpoint was bound to.
 		nicID := to.NIC
@@ -505,9 +486,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 		defer r.Release()
 
-		route = &r
+		route = r
 		dstPort = dst.Port
-		resolve = route.Resolve
 	}
 
 	if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() {
@@ -515,7 +495,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	}
 
 	if route.IsResolutionRequired() {
-		if ch, err := resolve(nil); err != nil {
+		if ch, err := route.Resolve(nil); err != nil {
 			if err == tcpip.ErrWouldBlock {
 				return 0, ch, tcpip.ErrNoLinkAddress
 			}
@@ -541,7 +521,24 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		useDefaultTTL = false
 	}
 
-	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil {
+	localPort := e.ID.LocalPort
+	sendTOS := e.sendTOS
+	owner := e.owner
+	noChecksum := e.SocketOptions().GetNoChecksum()
+	lockReleased = true
+	e.mu.RUnlock()
+
+	// Do not hold lock when sending as loopback is synchronous and if the UDP
+	// datagram ends up generating an ICMP response then it can result in a
+	// deadlock where the ICMP response handling ends up acquiring this endpoint's
+	// mutex using e.mu.RLock() in endpoint.HandleControlPacket which can cause a
+	// deadlock if another caller is trying to acquire e.mu in exclusive mode w/
+	// e.mu.Lock(). Since e.mu.Lock() prevents any new read locks to ensure the
+	// lock can be eventually acquired.
+	//
+	// See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read
+	// locking is prohibited.
+	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil {
 		return 0, nil, err
 	}
 	return int64(len(v)), nil, nil
@@ -552,66 +549,18 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
-// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-	case tcpip.MulticastLoopOption:
-		e.mu.Lock()
-		e.multicastLoop = v
-		e.mu.Unlock()
-
-	case tcpip.NoChecksumOption:
-		e.mu.Lock()
-		e.noChecksum = v
-		e.mu.Unlock()
-
-	case tcpip.ReceiveTOSOption:
-		e.mu.Lock()
-		e.receiveTOS = v
-		e.mu.Unlock()
-
-	case tcpip.ReceiveTClassOption:
-		// We only support this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrNotSupported
-		}
-
-		e.mu.Lock()
-		e.receiveTClass = v
-		e.mu.Unlock()
-
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.Lock()
-		e.receiveIPPacketInfo = v
-		e.mu.Unlock()
-
-	case tcpip.ReuseAddressOption:
-		e.mu.Lock()
-		e.portFlags.MostRecent = v
-		e.mu.Unlock()
-
-	case tcpip.ReusePortOption:
-		e.mu.Lock()
-		e.portFlags.LoadBalanced = v
-		e.mu.Unlock()
-
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrInvalidEndpointState
-		}
-
-		e.mu.Lock()
-		defer e.mu.Unlock()
-
-		// We only allow this to be set when we're in the initial state.
-		if e.state != StateInitial {
-			return tcpip.ErrInvalidEndpointState
-		}
+// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
+func (e *endpoint) OnReuseAddressSet(v bool) {
+	e.mu.Lock()
+	e.portFlags.MostRecent = v
+	e.mu.Unlock()
+}
 
-		e.v6only = v
-	}
-	return nil
+// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
+func (e *endpoint) OnReusePortSet(v bool) {
+	e.mu.Lock()
+	e.portFlags.LoadBalanced = v
+	e.mu.Unlock()
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
@@ -823,81 +772,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	return nil
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption:
-		return false, nil
-
-	case tcpip.MulticastLoopOption:
-		e.mu.RLock()
-		v := e.multicastLoop
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.NoChecksumOption:
-		e.mu.RLock()
-		v := e.noChecksum
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReceiveTOSOption:
-		e.mu.RLock()
-		v := e.receiveTOS
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReceiveTClassOption:
-		// We only support this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrNotSupported
-		}
-
-		e.mu.RLock()
-		v := e.receiveTClass
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.RLock()
-		v := e.receiveIPPacketInfo
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReuseAddressOption:
-		e.mu.RLock()
-		v := e.portFlags.MostRecent
-		e.mu.RUnlock()
-
-		return v, nil
-
-	case tcpip.ReusePortOption:
-		e.mu.RLock()
-		v := e.portFlags.LoadBalanced
-		e.mu.RUnlock()
-
-		return v, nil
-
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrUnknownProtocolOption
-		}
-
-		e.mu.RLock()
-		v := e.v6only
-		e.mu.RUnlock()
-
-		return v, nil
-
-	case tcpip.AcceptConnOption:
-		return false, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -1036,7 +910,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 // checkV4MappedLocked determines the effective network protocol and converts
 // addr to its canonical form.
 func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
 	if err != nil {
 		return tcpip.FullAddress{}, 0, err
 	}
@@ -1048,7 +922,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	if e.state != StateConnected {
+	if e.EndpointState() != StateConnected {
 		return nil
 	}
 	var (
@@ -1071,7 +945,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 		if err != nil {
 			return err
 		}
-		e.state = StateBound
+		e.setEndpointState(StateBound)
 		boundPortFlags = e.boundPortFlags
 	} else {
 		if e.ID.LocalPort != 0 {
@@ -1079,14 +953,14 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{})
 			e.boundPortFlags = ports.Flags{}
 		}
-		e.state = StateInitial
+		e.setEndpointState(StateInitial)
 	}
 
 	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice)
 	e.ID = id
 	e.boundBindToDevice = btd
 	e.route.Release()
-	e.route = stack.Route{}
+	e.route = nil
 	e.dstPort = 0
 
 	return nil
@@ -1104,7 +978,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	nicID := addr.NIC
 	var localPort uint16
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial:
 	case StateBound, StateConnected:
 		localPort = e.ID.LocalPort
@@ -1139,7 +1013,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		RemoteAddress: r.RemoteAddress,
 	}
 
-	if e.state == StateInitial {
+	if e.EndpointState() == StateInitial {
 		id.LocalAddress = r.LocalAddress
 	}
 
@@ -1147,7 +1021,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	// packets on a different network protocol, so we register both even if
 	// v6only is set to false and this is an ipv6 endpoint.
 	netProtos := []tcpip.NetworkProtocolNumber{netProto}
-	if netProto == header.IPv6ProtocolNumber && !e.v6only {
+	if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() {
 		netProtos = []tcpip.NetworkProtocolNumber{
 			header.IPv4ProtocolNumber,
 			header.IPv6ProtocolNumber,
@@ -1173,7 +1047,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	e.RegisterNICID = nicID
 	e.effectiveNetProtos = netProtos
 
-	e.state = StateConnected
+	e.setEndpointState(StateConnected)
 
 	e.rcvMu.Lock()
 	e.rcvReady = true
@@ -1195,7 +1069,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 	// A socket in the bound state can still receive multicast messages,
 	// so we need to notify waiters on shutdown.
-	if e.state != StateBound && e.state != StateConnected {
+	if state := e.EndpointState(); state != StateBound && state != StateConnected {
 		return tcpip.ErrNotConnected
 	}
 
@@ -1246,7 +1120,7 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ
 func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return tcpip.ErrInvalidEndpointState
 	}
 
@@ -1259,7 +1133,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
 	// set to false.
 	netProtos := []tcpip.NetworkProtocolNumber{netProto}
-	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
+	if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && addr.Addr == "" {
 		netProtos = []tcpip.NetworkProtocolNumber{
 			header.IPv6ProtocolNumber,
 			header.IPv4ProtocolNumber,
@@ -1267,7 +1141,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	}
 
 	nicID := addr.NIC
-	if len(addr.Addr) != 0 && !isBroadcastOrMulticast(addr.Addr) {
+	if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) {
 		// A local unicast address was specified, verify that it's valid.
 		nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
 		if nicID == 0 {
@@ -1290,7 +1164,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	e.effectiveNetProtos = netProtos
 
 	// Mark endpoint as bound.
-	e.state = StateBound
+	e.setEndpointState(StateBound)
 
 	e.rcvMu.Lock()
 	e.rcvReady = true
@@ -1322,7 +1196,7 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	addr := e.ID.LocalAddress
-	if e.state == StateConnected {
+	if e.EndpointState() == StateConnected {
 		addr = e.route.LocalAddress
 	}
 
@@ -1338,7 +1212,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	if e.state != StateConnected {
+	if e.EndpointState() != StateConnected {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
@@ -1470,25 +1344,20 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
 func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	if typ == stack.ControlPortUnreachable {
-		e.mu.RLock()
-		if e.state == StateConnected {
+		if e.EndpointState() == StateConnected {
 			e.lastErrorMu.Lock()
 			e.lastError = tcpip.ErrConnectionRefused
 			e.lastErrorMu.Unlock()
-			e.mu.RUnlock()
 
 			e.waiterQueue.Notify(waiter.EventErr)
 			return
 		}
-		e.mu.RUnlock()
 	}
 }
 
 // State implements tcpip.Endpoint.State.
 func (e *endpoint) State() uint32 {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return uint32(e.state)
+	return uint32(e.EndpointState())
 }
 
 // Info returns a copy of the endpoint info.
@@ -1508,14 +1377,16 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 // Wait implements tcpip.Endpoint.Wait.
 func (*endpoint) Wait() {}
 
-func isBroadcastOrMulticast(a tcpip.Address) bool {
-	return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a)
+func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool {
+	return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr)
 }
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 858c99a45..13b72dc88 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -98,7 +98,8 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		}
 	}
 
-	if e.state != StateBound && e.state != StateConnected {
+	state := e.EndpointState()
+	if state != StateBound && state != StateConnected {
 		return
 	}
 
@@ -113,12 +114,12 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	}
 
 	var err *tcpip.Error
-	if e.state == StateConnected {
-		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.multicastLoop)
+	if state == StateConnected {
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.ops.GetMulticastLoop())
 		if err != nil {
 			panic(err)
 		}
-	} else if len(e.ID.LocalAddress) != 0 && !isBroadcastOrMulticast(e.ID.LocalAddress) { // stateBound
+	} else if len(e.ID.LocalAddress) != 0 && !e.isBroadcastOrMulticast(e.RegisterNICID, netProto, e.ID.LocalAddress) { // stateBound
 		// A local unicast address is specified, verify that it's valid.
 		if e.stack.CheckLocalAddress(e.RegisterNICID, netProto, e.ID.LocalAddress) == 0 {
 			panic(tcpip.ErrBadLocalAddress)
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 764ad0857..e384f52dd 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -54,6 +55,7 @@ const (
 	stackPort       = 1234
 	testAddr        = "\x0a\x00\x00\x02"
 	testPort        = 4096
+	invalidPort     = 8192
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
@@ -295,7 +297,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext {
 	t.Helper()
 	return newDualTestContextWithOptions(t, mtu, stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
-		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, icmp.NewProtocol6, icmp.NewProtocol4},
+		HandleLocal:        true,
 	})
 }
 
@@ -360,9 +363,7 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 
 	c.createEndpoint(flow.sockProto())
 	if flow.isV6Only() {
-		if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-			c.t.Fatalf("SetSockOptBool failed: %s", err)
-		}
+		c.ep.SocketOptions().SetV6Only(true)
 	} else if flow.isBroadcast() {
 		c.ep.SocketOptions().SetBroadcast(true)
 	}
@@ -972,7 +973,7 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) {
 // provided.
 func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
 	c.t.Helper()
-	return testWriteInternal(c, flow, true, checkers...)
+	return testWriteAndVerifyInternal(c, flow, true, checkers...)
 }
 
 // testWriteWithoutDestination sends a packet of the given test flow from the
@@ -981,10 +982,10 @@ func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker
 // checker functions provided.
 func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
 	c.t.Helper()
-	return testWriteInternal(c, flow, false, checkers...)
+	return testWriteAndVerifyInternal(c, flow, false, checkers...)
 }
 
-func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View {
 	c.t.Helper()
 	// Take a snapshot of the stats to validate them at the end of the test.
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
@@ -1006,6 +1007,12 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...
 		c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload))
 	}
 	c.checkEndpointWriteStats(1, epstats, err)
+	return payload
+}
+
+func testWriteAndVerifyInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+	c.t.Helper()
+	payload := testWriteNoVerify(c, flow, setDest)
 	// Received the packet and check the payload.
 	b := c.getPacketAndVerify(flow, checkers...)
 	var udp header.UDP
@@ -1150,6 +1157,39 @@ func TestV4WriteOnConnected(t *testing.T) {
 	testWriteWithoutDestination(c, unicastV4)
 }
 
+func TestWriteOnConnectedInvalidPort(t *testing.T) {
+	protocols := map[string]tcpip.NetworkProtocolNumber{
+		"ipv4": ipv4.ProtocolNumber,
+		"ipv6": ipv6.ProtocolNumber,
+	}
+	for name, pn := range protocols {
+		t.Run(name, func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpoint(pn)
+			if err := c.ep.Connect(tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}); err != nil {
+				c.t.Fatalf("Connect failed: %s", err)
+			}
+			writeOpts := tcpip.WriteOptions{
+				To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort},
+			}
+			payload := buffer.View(newPayload())
+			n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts)
+			if err != nil {
+				c.t.Fatalf("c.ep.Write(...) = %+s, want nil", err)
+			}
+			if got, want := n, int64(len(payload)); got != want {
+				c.t.Fatalf("c.ep.Write(...) wrote %d bytes, want %d bytes", got, want)
+			}
+
+			if err := c.ep.LastError(); err != tcpip.ErrConnectionRefused {
+				c.t.Fatalf("expected c.ep.LastError() == ErrConnectionRefused, got: %+v", err)
+			}
+		})
+	}
+}
+
 // TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket
 // that is bound to a V4 multicast address.
 func TestWriteOnBoundToV4Multicast(t *testing.T) {
@@ -1372,9 +1412,7 @@ func TestReadIPPacketInfo(t *testing.T) {
 				}
 			}
 
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true); err != nil {
-				t.Fatalf("c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true): %s", err)
-			}
+			c.ep.SocketOptions().SetReceivePacketInfo(true)
 
 			testRead(c, test.flow, checker.ReceiveIPPacketInfo(tcpip.IPPacketInfo{
 				NIC:             1,
@@ -1412,16 +1450,12 @@ func TestNoChecksum(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			// Disable the checksum generation.
-			if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, true); err != nil {
-				t.Fatalf("SetSockOptBool failed: %s", err)
-			}
+			c.ep.SocketOptions().SetNoChecksum(true)
 			// This option is effective on IPv4 only.
 			testWrite(c, flow, checker.UDP(checker.NoChecksum(flow.isV4())))
 
 			// Enable the checksum generation.
-			if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, false); err != nil {
-				t.Fatalf("SetSockOptBool failed: %s", err)
-			}
+			c.ep.SocketOptions().SetNoChecksum(false)
 			testWrite(c, flow, checker.UDP(checker.NoChecksum(false)))
 		})
 	}
@@ -1591,13 +1625,15 @@ func TestSetTClass(t *testing.T) {
 }
 
 func TestReceiveTosTClass(t *testing.T) {
+	const RcvTOSOpt = "ReceiveTosOption"
+	const RcvTClassOpt = "ReceiveTClassOption"
+
 	testCases := []struct {
-		name             string
-		getReceiveOption tcpip.SockOptBool
-		tests            []testFlow
+		name  string
+		tests []testFlow
 	}{
-		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
-		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+		{RcvTOSOpt, []testFlow{unicastV4, broadcast}},
+		{RcvTClassOpt, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
 	}
 	for _, testCase := range testCases {
 		for _, flow := range testCase.tests {
@@ -1606,29 +1642,32 @@ func TestReceiveTosTClass(t *testing.T) {
 				defer c.cleanup()
 
 				c.createEndpointForFlow(flow)
-				option := testCase.getReceiveOption
 				name := testCase.name
 
-				// Verify that setting and reading the option works.
-				v, err := c.ep.GetSockOptBool(option)
-				if err != nil {
-					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
+				var optionGetter func() bool
+				var optionSetter func(bool)
+				switch name {
+				case RcvTOSOpt:
+					optionGetter = c.ep.SocketOptions().GetReceiveTOS
+					optionSetter = c.ep.SocketOptions().SetReceiveTOS
+				case RcvTClassOpt:
+					optionGetter = c.ep.SocketOptions().GetReceiveTClass
+					optionSetter = c.ep.SocketOptions().SetReceiveTClass
+				default:
+					t.Fatalf("unkown test variant: %s", name)
 				}
+
+				// Verify that setting and reading the option works.
+				v := optionGetter()
 				// Test for expected default value.
 				if v != false {
 					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
 				}
 
 				want := true
-				if err := c.ep.SetSockOptBool(option, want); err != nil {
-					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
-				}
-
-				got, err := c.ep.GetSockOptBool(option)
-				if err != nil {
-					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
-				}
+				optionSetter(want)
 
+				got := optionGetter()
 				if got != want {
 					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
 				}
@@ -1638,10 +1677,10 @@ func TestReceiveTosTClass(t *testing.T) {
 				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
 					c.t.Fatalf("Bind failed: %s", err)
 				}
-				switch option {
-				case tcpip.ReceiveTClassOption:
+				switch name {
+				case RcvTClassOpt:
 					testRead(c, flow, checker.ReceiveTClass(testTOS))
-				case tcpip.ReceiveTOSOption:
+				case RcvTOSOpt:
 					testRead(c, flow, checker.ReceiveTOS(testTOS))
 				default:
 					t.Fatalf("unknown test variant: %s", name)
diff --git a/pkg/test/criutil/criutil.go b/pkg/test/criutil/criutil.go
index 70945f234..e41769017 100644
--- a/pkg/test/criutil/criutil.go
+++ b/pkg/test/criutil/criutil.go
@@ -54,14 +54,20 @@ func ResolvePath(executable string) string {
 		}
 	}
 
+	// Favor /usr/local/bin, if it exists.
+	localBin := fmt.Sprintf("/usr/local/bin/%s", executable)
+	if _, err := os.Stat(localBin); err == nil {
+		return localBin
+	}
+
 	// Try to find via the path.
-	guess, err := exec.LookPath(executable)
+	guess, _ := exec.LookPath(executable)
 	if err == nil {
 		return guess
 	}
 
-	// Return a default path.
-	return fmt.Sprintf("/usr/local/bin/%s", executable)
+	// Return a bare path; this generates a suitable error.
+	return executable
 }
 
 // NewCrictl returns a Crictl configured with a timeout and an endpoint over
diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go
index 64d17f661..2bf0a22ff 100644
--- a/pkg/test/dockerutil/container.go
+++ b/pkg/test/dockerutil/container.go
@@ -17,6 +17,7 @@ package dockerutil
 import (
 	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"net"
@@ -351,6 +352,9 @@ func (c *Container) SandboxPid(ctx context.Context) (int, error) {
 	return resp.ContainerJSONBase.State.Pid, nil
 }
 
+// ErrNoIP indicates that no IP address is available.
+var ErrNoIP = errors.New("no IP available")
+
 // FindIP returns the IP address of the container.
 func (c *Container) FindIP(ctx context.Context, ipv6 bool) (net.IP, error) {
 	resp, err := c.client.ContainerInspect(ctx, c.id)
@@ -365,7 +369,7 @@ func (c *Container) FindIP(ctx context.Context, ipv6 bool) (net.IP, error) {
 		ip = net.ParseIP(resp.NetworkSettings.DefaultNetworkSettings.IPAddress)
 	}
 	if ip == nil {
-		return net.IP{}, fmt.Errorf("invalid IP: %q", ip)
+		return net.IP{}, ErrNoIP
 	}
 	return ip, nil
 }
diff --git a/pkg/test/dockerutil/exec.go b/pkg/test/dockerutil/exec.go
index 4c739c9e9..bf968acec 100644
--- a/pkg/test/dockerutil/exec.go
+++ b/pkg/test/dockerutil/exec.go
@@ -77,11 +77,6 @@ func (c *Container) doExec(ctx context.Context, r ExecOpts, args []string) (Proc
 		return Process{}, fmt.Errorf("exec attach failed with err: %v", err)
 	}
 
-	if err := c.client.ContainerExecStart(ctx, resp.ID, types.ExecStartCheck{}); err != nil {
-		hijack.Close()
-		return Process{}, fmt.Errorf("exec start failed with err: %v", err)
-	}
-
 	return Process{
 		container: c,
 		execid:    resp.ID,
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 49ab87c58..fdd416b5e 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -36,7 +36,6 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
-	"sync/atomic"
 	"syscall"
 	"testing"
 	"time"
@@ -49,7 +48,10 @@ import (
 )
 
 var (
-	checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support")
+	checkpoint           = flag.Bool("checkpoint", true, "control checkpoint/restore support")
+	partition            = flag.Int("partition", 1, "partition number, this is 1-indexed")
+	totalPartitions      = flag.Int("total_partitions", 1, "total number of partitions")
+	isRunningWithHostNet = flag.Bool("hostnet", false, "whether test is running with hostnet")
 )
 
 // IsCheckpointSupported returns the relevant command line flag.
@@ -57,6 +59,11 @@ func IsCheckpointSupported() bool {
 	return *checkpoint
 }
 
+// IsRunningWithHostNet returns the relevant command line flag.
+func IsRunningWithHostNet() bool {
+	return *isRunningWithHostNet
+}
+
 // ImageByName mangles the image name used locally. This depends on the image
 // build infrastructure in images/ and tools/vm.
 func ImageByName(name string) string {
@@ -249,14 +256,25 @@ func writeSpec(dir string, spec *specs.Spec) error {
 // idRandomSrc is a pseudo random generator used to in RandomID.
 var idRandomSrc = rand.New(rand.NewSource(time.Now().UnixNano()))
 
+// idRandomSrcMtx is the mutex protecting idRandomSrc.Read from being used
+// concurrently in differnt goroutines.
+var idRandomSrcMtx sync.Mutex
+
 // RandomID returns 20 random bytes following the given prefix.
 func RandomID(prefix string) string {
 	// Read 20 random bytes.
 	b := make([]byte, 20)
+	// Rand.Read is not safe for concurrent use. Packetimpact tests can be run in
+	// parallel now, so we have to protect the Read with a mutex. Otherwise we'll
+	// run into name conflicts.
+	// https://golang.org/pkg/math/rand/#Rand.Read
+	idRandomSrcMtx.Lock()
 	// "[Read] always returns len(p) and a nil error." --godoc
 	if _, err := idRandomSrc.Read(b); err != nil {
+		idRandomSrcMtx.Unlock()
 		panic("rand.Read failed: " + err.Error())
 	}
+	idRandomSrcMtx.Unlock()
 	if prefix != "" {
 		prefix = prefix + "-"
 	}
@@ -417,33 +435,35 @@ func StartReaper() func() {
 
 // WaitUntilRead reads from the given reader until the wanted string is found
 // or until timeout.
-func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {
+func WaitUntilRead(r io.Reader, want string, timeout time.Duration) error {
 	sc := bufio.NewScanner(r)
-	if split != nil {
-		sc.Split(split)
-	}
 	// done must be accessed atomically. A value greater than 0 indicates
 	// that the read loop can exit.
-	var done uint32
-	doneCh := make(chan struct{})
+	doneCh := make(chan bool)
+	defer close(doneCh)
 	go func() {
 		for sc.Scan() {
 			t := sc.Text()
 			if strings.Contains(t, want) {
-				atomic.StoreUint32(&done, 1)
-				close(doneCh)
-				break
+				doneCh <- true
+				return
 			}
-			if atomic.LoadUint32(&done) > 0 {
-				break
+			select {
+			case <-doneCh:
+				return
+			default:
 			}
 		}
+		doneCh <- false
 	}()
+
 	select {
 	case <-time.After(timeout):
-		atomic.StoreUint32(&done, 1)
 		return fmt.Errorf("timeout waiting to read %q", want)
-	case <-doneCh:
+	case res := <-doneCh:
+		if !res {
+			return fmt.Errorf("reader closed while waiting to read %q", want)
+		}
 		return nil
 	}
 }
@@ -509,7 +529,8 @@ func TouchShardStatusFile() error {
 }
 
 // TestIndicesForShard returns indices for this test shard based on the
-// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
+// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars, as well as
+// the passed partition flags.
 //
 // If either of the env vars are not present, then the function will return all
 // tests. If there are more shards than there are tests, then the returned list
@@ -534,6 +555,11 @@ func TestIndicesForShard(numTests int) ([]int, error) {
 		}
 	}
 
+	// Combine with the partitions.
+	partitionSize := shardTotal
+	shardTotal = (*totalPartitions) * shardTotal
+	shardIndex = partitionSize*(*partition-1) + shardIndex
+
 	// Calculate!
 	var indices []int
 	numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal)))
diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index 08519d986..83d4f893a 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -119,7 +119,10 @@ type EntryCallback interface {
 	// The callback is supposed to perform minimal work, and cannot call
 	// any method on the queue itself because it will be locked while the
 	// callback is running.
-	Callback(e *Entry)
+	//
+	// The mask indicates the events that occurred and that the entry is
+	// interested in.
+	Callback(e *Entry, mask EventMask)
 }
 
 // Entry represents a waiter that can be add to the a wait queue. It can
@@ -140,7 +143,7 @@ type channelCallback struct {
 }
 
 // Callback implements EntryCallback.Callback.
-func (c *channelCallback) Callback(*Entry) {
+func (c *channelCallback) Callback(*Entry, EventMask) {
 	select {
 	case c.ch <- struct{}{}:
 	default:
@@ -193,8 +196,8 @@ func (q *Queue) EventUnregister(e *Entry) {
 func (q *Queue) Notify(mask EventMask) {
 	q.mu.RLock()
 	for e := q.list.Front(); e != nil; e = e.Next() {
-		if mask&e.mask != 0 {
-			e.Callback.Callback(e)
+		if m := mask & e.mask; m != 0 {
+			e.Callback.Callback(e, m)
 		}
 	}
 	q.mu.RUnlock()
diff --git a/pkg/waiter/waiter_test.go b/pkg/waiter/waiter_test.go
index c1b94a4f3..6928f28b4 100644
--- a/pkg/waiter/waiter_test.go
+++ b/pkg/waiter/waiter_test.go
@@ -20,12 +20,12 @@ import (
 )
 
 type callbackStub struct {
-	f func(e *Entry)
+	f func(e *Entry, m EventMask)
 }
 
 // Callback implements EntryCallback.Callback.
-func (c *callbackStub) Callback(e *Entry) {
-	c.f(e)
+func (c *callbackStub) Callback(e *Entry, m EventMask) {
+	c.f(e, m)
 }
 
 func TestEmptyQueue(t *testing.T) {
@@ -36,7 +36,7 @@ func TestEmptyQueue(t *testing.T) {
 
 	// Register then unregister a waiter, then notify the queue.
 	cnt := 0
-	e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}}
+	e := Entry{Callback: &callbackStub{func(*Entry, EventMask) { cnt++ }}}
 	q.EventRegister(&e, EventIn)
 	q.EventUnregister(&e)
 	q.Notify(EventIn)
@@ -49,7 +49,7 @@ func TestMask(t *testing.T) {
 	// Register a waiter.
 	var q Queue
 	var cnt int
-	e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}}
+	e := Entry{Callback: &callbackStub{func(*Entry, EventMask) { cnt++ }}}
 	q.EventRegister(&e, EventIn|EventErr)
 
 	// Notify with an overlapping mask.
@@ -101,11 +101,14 @@ func TestConcurrentRegistration(t *testing.T) {
 	for i := 0; i < concurrency; i++ {
 		go func() {
 			var e Entry
-			e.Callback = &callbackStub{func(entry *Entry) {
+			e.Callback = &callbackStub{func(entry *Entry, mask EventMask) {
 				cnt++
 				if entry != &e {
 					t.Errorf("entry = %p, want %p", entry, &e)
 				}
+				if mask != EventIn {
+					t.Errorf("mask = %#x want %#x", mask, EventIn)
+				}
 			}}
 
 			// Wait for notification, then register.
@@ -158,11 +161,14 @@ func TestConcurrentNotification(t *testing.T) {
 	// Register waiters.
 	for i := 0; i < waiterCount; i++ {
 		var e Entry
-		e.Callback = &callbackStub{func(entry *Entry) {
+		e.Callback = &callbackStub{func(entry *Entry, mask EventMask) {
 			atomic.AddInt32(&cnt, 1)
 			if entry != &e {
 				t.Errorf("entry = %p, want %p", entry, &e)
 			}
+			if mask != EventIn {
+				t.Errorf("mask = %#x want %#x", mask, EventIn)
+			}
 		}}
 
 		q.EventRegister(&e, EventIn|EventErr)