155 files changed, 2793 insertions, 1932 deletions
diff --git a/.travis.yml b/.travis.yml
index a2a260538..acbd3d61b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,3 +17,7 @@ matrix:
 script:
    - uname -a
    - make DOCKER_RUN_OPTIONS="" BAZEL_OPTIONS="build runsc:runsc" bazel && $RUNSC_PATH --alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do ls
+branches:
+  except:
+  # Skip copybara branches.
+  - /^test\/cl.*$/
diff --git a/BUILD b/BUILD
index 5fd929378..a709a9816 100644
--- a/BUILD
+++ b/BUILD
@@ -49,10 +49,31 @@ gazelle(name = "gazelle")
 # live in the tools subdirectory (unless they are standard).
 nogo(
     name = "nogo",
-    config = "//tools:nogo.js",
+    config = "//tools:nogo.json",
     visibility = ["//visibility:public"],
     deps = [
         "//tools/checkunsafe",
+        "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/assign:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/atomic:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/atomicalign:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/bools:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/buildtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/cgocall:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/copylock:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/deepequalerrors:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/loopclosure:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/lostcancel:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilfunc:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilness:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/printf:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/shift:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/stdmethods:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/structtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/tests:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unmarshal:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unsafeptr:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unusedresult:go_tool_library",
     ],
 )
 
diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
index 513d16e4f..1a624df2e 100644
--- a/benchmarks/harness/machine_producers/gcloud_producer.py
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -168,7 +168,9 @@ class GCloudProducer(machine_producer.MachineProducer):
     cmd.append("--zone=" + self.zone)
     cmd.append("--machine-type=" + self.machine_type)
     res = self._run_command(cmd)
-    return json.loads(res.stdout)
+    data = res.stdout
+    data = str(data, "utf-8") if isinstance(data, (bytes, bytearray)) else data
+    return json.loads(data)
 
   def _add_ssh_key_to_instances(self, names: List[str]) -> None:
     """Adds ssh key to instances by calling gcloud ssh command.
@@ -186,11 +188,11 @@ class GCloudProducer(machine_producer.MachineProducer):
       TimeoutError: when 3 unsuccessful tries to ssh into the host return 255.
     """
     for name in names:
-      cmd = "gcloud compute ssh {name}".format(name=name).split(" ")
+      cmd = "gcloud compute ssh {user}@{name}".format(
+          user=self.ssh_user, name=name).split(" ")
       cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file))
       cmd.append("--zone={zone}".format(zone=self.zone))
       cmd.append("--command=uname")
-      cmd.append("--ssh-key-expire-after=60m")
       timeout = datetime.timedelta(seconds=5 * 60)
       start = datetime.datetime.now()
       while datetime.datetime.now() <= timeout + start:
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
index 2f531aa72..323b0f77b 100644
--- a/kokoro/kythe/generate_xrefs.sh
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -23,7 +23,7 @@ bazel version
 
 python3 -V
 
-readonly KYTHE_VERSION='v0.0.43'
+readonly KYTHE_VERSION='v0.0.41'
 readonly WORKDIR="$(mktemp -d)"
 readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
 if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
diff --git a/kokoro/packetimpact_tests.cfg b/kokoro/packetimpact_tests.cfg
new file mode 100644
index 000000000..db86b52d5
--- /dev/null
+++ b/kokoro/packetimpact_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/packetimpact_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index dbe58acbe..055ac1d7c 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -287,6 +287,11 @@ func (m FileMode) ExtraBits() FileMode {
 	return m &^ (PermissionsMask | FileTypeMask)
 }
 
+// IsDir returns true if file type represents a directory.
+func (m FileMode) IsDir() bool {
+	return m.FileType() == S_IFDIR
+}
+
 // String returns a string representation of m.
 func (m FileMode) String() string {
 	var s []string
diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go
index 93a435b80..998645388 100644
--- a/pkg/bits/bits_template.go
+++ b/pkg/bits/bits_template.go
@@ -42,3 +42,11 @@ func Mask(is ...int) T {
 func MaskOf(i int) T {
 	return T(1) << T(i)
 }
+
+// IsPowerOfTwo returns true if v is power of 2.
+func IsPowerOfTwo(v T) bool {
+	if v == 0 {
+		return false
+	}
+	return v&(v-1) == 0
+}
diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go
index 1b018d808..193d1ebcd 100644
--- a/pkg/bits/uint64_test.go
+++ b/pkg/bits/uint64_test.go
@@ -114,3 +114,21 @@ func TestIsOn(t *testing.T) {
 		}
 	}
 }
+
+func TestIsPowerOfTwo(t *testing.T) {
+	for _, tc := range []struct {
+		v    uint64
+		want bool
+	}{
+		{v: 0, want: false},
+		{v: 1, want: true},
+		{v: 2, want: true},
+		{v: 3, want: false},
+		{v: 4, want: true},
+		{v: 5, want: false},
+	} {
+		if got := IsPowerOfTwo64(tc.v); got != tc.want {
+			t.Errorf("IsPowerOfTwo(%d) = %t, want: %t", tc.v, got, tc.want)
+		}
+	}
+}
diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go
index 850693df8..316015e06 100644
--- a/pkg/gate/gate_test.go
+++ b/pkg/gate/gate_test.go
@@ -15,6 +15,7 @@
 package gate_test
 
 import (
+	"runtime"
 	"testing"
 	"time"
 
@@ -165,6 +166,8 @@ func worker(g *gate.Gate, done *sync.WaitGroup) {
 		if !g.Enter() {
 			break
 		}
+		// Golang before v1.14 doesn't preempt busyloops.
+		runtime.Gosched()
 		g.Leave()
 	}
 	done.Done()
diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 1aec96e2b..fa6a21026 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -17,6 +17,7 @@
 package rand
 
 import (
+	"bufio"
 	"crypto/rand"
 	"io"
 
@@ -45,18 +46,26 @@ func (r *reader) Read(p []byte) (int, error) {
 	return rand.Read(p)
 }
 
-// mu protects the global Reader below.
-var mu sync.Mutex
+// bufferedReader implements a threadsafe buffered io.Reader.
+type bufferedReader struct {
+	mu sync.Mutex
+	r  *bufio.Reader
+}
+
+// Read implements io.Reader.Read.
+func (b *bufferedReader) Read(p []byte) (int, error) {
+	b.mu.Lock()
+	n, err := b.r.Read(p)
+	b.mu.Unlock()
+	return n, err
+}
 
 // Reader is the default reader.
-var Reader io.Reader = &reader{}
+var Reader io.Reader = &bufferedReader{r: bufio.NewReader(&reader{})}
 
 // Read reads from the default reader.
 func Read(b []byte) (int, error) {
-	mu.Lock()
-	n, err := io.ReadFull(Reader, b)
-	mu.Unlock()
-	return n, err
+	return io.ReadFull(Reader, b)
 }
 
 // Init can be called to make sure /dev/urandom is pre-opened on kernels that
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index b998f84fc..c29e1b841 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -100,6 +100,9 @@ type State struct {
 
 	// FeatureSet is a pointer to the currently active feature set.
 	FeatureSet *cpuid.FeatureSet
+
+	// OrigR0 stores the value of register R0.
+	OrigR0 uint64
 }
 
 // Proto returns a protobuf representation of the system registers in State.
@@ -150,6 +153,7 @@ func (s *State) Fork() State {
 		aarch64FPState: s.aarch64FPState.fork(),
 		TPValue:        s.TPValue,
 		FeatureSet:     s.FeatureSet,
+		OrigR0:         s.OrigR0,
 	}
 }
 
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
index 00d5ef461..dc13b6124 100644
--- a/pkg/sentry/arch/syscalls_arm64.go
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -50,13 +50,21 @@ func (c *context64) SyscallArgs() SyscallArguments {
 }
 
 // RestartSyscall implements Context.RestartSyscall.
+// Prepare for system call restart, OrigR0 will be restored to R0.
+// Please see the linux code as reference:
+// arch/arm64/kernel/signal.c:do_signal()
 func (c *context64) RestartSyscall() {
 	c.Regs.Pc -= SyscallWidth
-	c.Regs.Regs[8] = uint64(restartSyscallNr)
+	// R0 will be backed up into OrigR0 when entering doSyscall().
+	// Please see the linux code as reference:
+	// arch/arm64/kernel/syscall.c:el0_svc_common().
+	// Here we restore it back.
+	c.Regs.Regs[0] = uint64(c.OrigR0)
 }
 
 // RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock.
 func (c *context64) RestartSyscallWithRestartBlock() {
 	c.Regs.Pc -= SyscallWidth
+	c.Regs.Regs[0] = uint64(c.OrigR0)
 	c.Regs.Regs[8] = uint64(restartSyscallNr)
 }
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index e0b32e1c1..0266a5287 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -17,7 +17,6 @@ package fs
 import (
 	"fmt"
 	"path"
-	"sort"
 	"sync/atomic"
 	"syscall"
 
@@ -121,9 +120,6 @@ type Dirent struct {
 	// deleted may be set atomically when removed.
 	deleted int32
 
-	// frozen indicates this entry can't walk to unknown nodes.
-	frozen bool
-
 	// mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED.
 	mounted bool
 
@@ -253,8 +249,7 @@ func (d *Dirent) IsNegative() bool {
 	return d.Inode == nil
 }
 
-// hashChild will hash child into the children list of its new parent d, carrying over
-// any "frozen" state from d.
+// hashChild will hash child into the children list of its new parent d.
 //
 // Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must
 // validate the returned unhashed weak reference. Common cases:
@@ -282,9 +277,6 @@ func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
 		d.IncRef()
 	}
 
-	// Carry over parent's frozen state.
-	child.frozen = d.frozen
-
 	return d.hashChildParentSet(child)
 }
 
@@ -400,38 +392,6 @@ func (d *Dirent) MountRoot() *Dirent {
 	return mountRoot
 }
 
-// Freeze prevents this dirent from walking to more nodes. Freeze is applied
-// recursively to all children.
-//
-// If this particular Dirent represents a Virtual node, then Walks and Creates
-// may proceed as before.
-//
-// Freeze can only be called before the application starts running, otherwise
-// the root it might be out of sync with the application root if modified by
-// sys_chroot.
-func (d *Dirent) Freeze() {
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	if d.frozen {
-		// Already frozen.
-		return
-	}
-	d.frozen = true
-
-	// Take a reference when freezing.
-	for _, w := range d.children {
-		if child := w.Get(); child != nil {
-			// NOTE: We would normally drop the reference here. But
-			// instead we're hanging on to it.
-			ch := child.(*Dirent)
-			ch.Freeze()
-		}
-	}
-
-	// Drop all expired weak references.
-	d.flush()
-}
-
 // descendantOf returns true if the receiver dirent is equal to, or a
 // descendant of, the argument dirent.
 //
@@ -524,11 +484,6 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
 		w.Drop()
 	}
 
-	// Are we allowed to do the lookup?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return nil, syscall.ENOENT
-	}
-
 	// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
 	// expensive, if possible release the lock and re-acquire it.
 	if walkMayUnlock {
@@ -659,11 +614,6 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
 		return nil, syscall.EEXIST
 	}
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return nil, syscall.ENOENT
-	}
-
 	// Try the create. We need to trust the file system to return EEXIST (or something
 	// that will translate to EEXIST) if name already exists.
 	file, err := d.Inode.Create(ctx, d, name, flags, perms)
@@ -727,11 +677,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c
 		return syscall.EEXIST
 	}
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Remove any negative Dirent. We've already asserted above with d.exists
 	// that the only thing remaining here can be a negative Dirent.
 	if w, ok := d.children[name]; ok {
@@ -862,49 +807,6 @@ func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) {
 	return dot, dot
 }
 
-// readdirFrozen returns readdir results based solely on the frozen children.
-func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) {
-	// Collect attrs for "." and  "..".
-	attrs := make(map[string]DentAttr)
-	names := []string{".", ".."}
-	attrs["."], attrs[".."] = d.GetDotAttrs(root)
-
-	// Get info from all children.
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	for name, w := range d.children {
-		if child := w.Get(); child != nil {
-			defer child.DecRef()
-
-			// Skip negative children.
-			if child.(*Dirent).IsNegative() {
-				continue
-			}
-
-			sattr := child.(*Dirent).Inode.StableAttr
-			attrs[name] = DentAttr{
-				Type:    sattr.Type,
-				InodeID: sattr.InodeID,
-			}
-			names = append(names, name)
-		}
-	}
-
-	sort.Strings(names)
-
-	if int(offset) >= len(names) {
-		return offset, nil
-	}
-	names = names[int(offset):]
-	for _, name := range names {
-		if err := dirCtx.DirEmit(name, attrs[name]); err != nil {
-			return offset, err
-		}
-		offset++
-	}
-	return offset, nil
-}
-
 // DirIterator is an open directory containing directory entries that can be read.
 type DirIterator interface {
 	// IterateDir emits directory entries by calling dirCtx.EmitDir, beginning
@@ -964,10 +866,6 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent,
 		return offset, nil
 	}
 
-	if d.frozen {
-		return d.readdirFrozen(root, offset, dirCtx)
-	}
-
 	// Collect attrs for "." and "..".
 	dot, dotdot := d.GetDotAttrs(root)
 
@@ -1068,11 +966,6 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err
 		return nil, syserror.EINVAL
 	}
 
-	// Are we frozen?
-	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
-		return nil, syserror.ENOENT
-	}
-
 	// Dirent that'll replace d.
 	//
 	// Note that NewDirent returns with one reference taken; the reference
@@ -1101,11 +994,6 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
 		return syserror.ENOENT
 	}
 
-	// Are we frozen?
-	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
-		return syserror.ENOENT
-	}
-
 	// Remount our former child in its place.
 	//
 	// As replacement used to be our child, it must already have the right
@@ -1135,11 +1023,6 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
 	unlock := d.lockDirectory()
 	defer unlock()
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Try to walk to the node.
 	child, err := d.walk(ctx, root, name, false /* may unlock */)
 	if err != nil {
@@ -1201,11 +1084,6 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
 	unlock := d.lockDirectory()
 	defer unlock()
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Check for dots.
 	if name == "." {
 		// Rejected as the last component by rmdir(2).
@@ -1519,15 +1397,6 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		return err
 	}
 
-	// Are we frozen?
-	// TODO(jamieliu): Is this the right errno?
-	if oldParent.frozen && !oldParent.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-	if newParent.frozen && !newParent.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Do we have general permission to remove from oldParent and
 	// create/replace in newParent?
 	if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index a76d87e3a..1971cc680 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -175,90 +175,6 @@ func TestReaddirRevalidation(t *testing.T) {
 	}
 }
 
-// TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
-// a frozen dirent tree does not make Readdir calls to the underlying files.
-// This is a regression test for b/114808269.
-func TestReaddirOverlayFrozen(t *testing.T) {
-	ctx := contexttest.Context(t)
-
-	// Create an overlay with two directories, each with two files.
-	upper := newTestRamfsDir(ctx, []dirContent{{name: "upper-file1"}, {name: "upper-file2"}}, nil)
-	lower := newTestRamfsDir(ctx, []dirContent{{name: "lower-file1"}, {name: "lower-file2"}}, nil)
-	overlayInode := fs.NewTestOverlayDir(ctx, upper, lower, false)
-
-	// Set that overlay as the root.
-	root := fs.NewDirent(ctx, overlayInode, "root")
-	ctx = &rootContext{
-		Context: ctx,
-		root:    root,
-	}
-
-	// Check that calling Readdir on the root now returns all 4 files (2
-	// from each layer in the overlay).
-	rootFile, err := root.Inode.GetFile(ctx, root, fs.FileFlags{Read: true})
-	if err != nil {
-		t.Fatalf("root.Inode.GetFile failed: %v", err)
-	}
-	defer rootFile.DecRef()
-	ser := &fs.CollectEntriesSerializer{}
-	if err := rootFile.Readdir(ctx, ser); err != nil {
-		t.Fatalf("rootFile.Readdir failed: %v", err)
-	}
-	if got, want := ser.Order, []string{".", "..", "lower-file1", "lower-file2", "upper-file1", "upper-file2"}; !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got names %v, want %v", got, want)
-	}
-
-	// Readdir should have been called on upper and lower.
-	upperDir := upper.InodeOperations.(*dir)
-	lowerDir := lower.InodeOperations.(*dir)
-	if !upperDir.ReaddirCalled {
-		t.Errorf("upperDir.ReaddirCalled got %v, want true", upperDir.ReaddirCalled)
-	}
-	if !lowerDir.ReaddirCalled {
-		t.Errorf("lowerDir.ReaddirCalled got %v, want true", lowerDir.ReaddirCalled)
-	}
-
-	// Reset.
-	upperDir.ReaddirCalled = false
-	lowerDir.ReaddirCalled = false
-
-	// Take references on "upper-file1" and "lower-file1", pinning them in
-	// the dirent tree.
-	for _, name := range []string{"upper-file1", "lower-file1"} {
-		if _, err := root.Walk(ctx, root, name); err != nil {
-			t.Fatalf("root.Walk(%q) failed: %v", name, err)
-		}
-		// Don't drop a reference on the returned dirent so that it
-		// will stay in the tree.
-	}
-
-	// Freeze the dirent tree.
-	root.Freeze()
-
-	// Seek back to the beginning of the file.
-	if _, err := rootFile.Seek(ctx, fs.SeekSet, 0); err != nil {
-		t.Fatalf("error seeking to beginning of directory: %v", err)
-	}
-
-	// Calling Readdir on the root now will return only the pinned
-	// children.
-	ser = &fs.CollectEntriesSerializer{}
-	if err := rootFile.Readdir(ctx, ser); err != nil {
-		t.Fatalf("rootFile.Readdir failed: %v", err)
-	}
-	if got, want := ser.Order, []string{".", "..", "lower-file1", "upper-file1"}; !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got names %v, want %v", got, want)
-	}
-
-	// Readdir should NOT have been called on upper or lower.
-	if upperDir.ReaddirCalled {
-		t.Errorf("upperDir.ReaddirCalled got %v, want false", upperDir.ReaddirCalled)
-	}
-	if lowerDir.ReaddirCalled {
-		t.Errorf("lowerDir.ReaddirCalled got %v, want false", lowerDir.ReaddirCalled)
-	}
-}
-
 type rootContext struct {
 	context.Context
 	root *fs.Dirent
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
index 271582e54..150ac8e19 100644
--- a/pkg/sentry/fs/host/ioctl_unsafe.go
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
+// LINT.IfChange
+
 func ioctlGetTermios(fd int) (*linux.Termios, error) {
 	var t linux.Termios
 	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
@@ -54,3 +56,5 @@ func ioctlSetWinsize(fd int, w *linux.Winsize) error {
 	}
 	return nil
 }
+
+// LINT.ThenChange(../../fsimpl/host/ioctl_unsafe.go)
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 3f218b4a7..cb91355ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -26,6 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // TTYFileOperations implements fs.FileOperations for a host file descriptor
 // that wraps a TTY FD.
 //
@@ -43,6 +45,7 @@ type TTYFileOperations struct {
 	// connected to this TTY.
 	fgProcessGroup *kernel.ProcessGroup
 
+	// termios contains the terminal attributes for this TTY.
 	termios linux.KernelTermios
 }
 
@@ -357,3 +360,5 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
 	return kernel.ERESTARTSYS
 }
+
+// LINT.ThenChange(../../fsimpl/host/tty.go)
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index 388108fdf..1b0356930 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -23,7 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -80,9 +80,9 @@ func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr {
 		Usage:            s.Blocks * 512,
 		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
 		Owner:            owner(s),
-		AccessTime:       time.FromUnix(s.Atim.Sec, s.Atim.Nsec),
-		ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
-		StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
 		Links:            uint64(s.Nlink),
 	}
 }
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index c7981f66e..b414ddaee 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -273,19 +273,6 @@ func (mns *MountNamespace) DecRef() {
 	mns.DecRefWithDestructor(mns.destroy)
 }
 
-// Freeze freezes the entire mount tree.
-func (mns *MountNamespace) Freeze() {
-	mns.mu.Lock()
-	defer mns.mu.Unlock()
-
-	// We only want to freeze Dirents with active references, not Dirents referenced
-	// by a mount's MountSource.
-	mns.flushMountSourceRefsLocked()
-
-	// Freeze the entire shebang.
-	mns.root.Freeze()
-}
-
 // withMountLocked prevents further walks to `node`, because `node` is about to
 // be a mount point.
 func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 6962083f5..a39a37318 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -186,7 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 }
 
 func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+	return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID())
 }
 
 // statTo writes the statx fields to the output parameter.
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 38e4cdbc5..1e43df9ec 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -119,7 +119,7 @@ func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
-	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
@@ -314,7 +314,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	if parent.isDeleted() {
@@ -378,7 +378,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -454,6 +454,9 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	}
 	if fs.opts.interop != InteropModeShared {
 		parent.touchCMtime(ctx)
+		if dir {
+			parent.decLinks()
+		}
 		parent.cacheNegativeChildLocked(name)
 		parent.dirents = nil
 	}
@@ -509,7 +512,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
 	if err != nil {
 		return err
 	}
-	return d.checkPermissions(creds, ats, d.isDir())
+	return d.checkPermissions(creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -525,7 +528,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -569,8 +572,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
 		creds := rp.Credentials()
-		_, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		return err
+		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
+			return err
+		}
+		if fs.opts.interop != InteropModeShared {
+			parent.incLinks()
+		}
+		return nil
 	})
 }
 
@@ -616,7 +624,7 @@ afterTrailingSymlink:
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Determine whether or not we need to create a file.
@@ -653,7 +661,7 @@ afterTrailingSymlink:
 // Preconditions: fs.renameMu must be locked.
 func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
-	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
 	mnt := rp.Mount()
@@ -714,7 +722,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
 func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
-	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
 	if d.isDeleted() {
@@ -876,7 +884,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			return err
 		}
 	}
-	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	vfsObj := rp.VirtualFilesystem()
@@ -896,7 +904,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			return syserror.EINVAL
 		}
 		if oldParent != newParent {
-			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -907,7 +915,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	if oldParent != newParent {
-		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 			return err
 		}
 		newParent.dirMu.Lock()
@@ -962,6 +970,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.dirents = nil
 		delete(newParent.negativeChildren, newName)
 		newParent.dirents = nil
+		if renamed.isDir() {
+			oldParent.decLinks()
+			newParent.incLinks()
+		}
 	}
 	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
 	return nil
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 999485492..cf276a417 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -485,6 +485,11 @@ type dentry struct {
 	// locked to mutate it).
 	size uint64
 
+	// nlink counts the number of hard links to this dentry. It's updated and
+	// accessed using atomic operations. It's not protected by metadataMu like the
+	// other metadata fields.
+	nlink uint32
+
 	mapsMu sync.Mutex
 
 	// If this dentry represents a regular file, mappings tracks mappings of
@@ -604,6 +609,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	if mask.BTime {
 		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
 	}
+	if mask.NLink {
+		d.nlink = uint32(attr.NLink)
+	}
 	d.vfsd.Init(d)
 
 	fs.syncMu.Lock()
@@ -645,6 +653,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 	if mask.BTime {
 		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
 	}
+	if mask.NLink {
+		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
+	}
 	if mask.Size {
 		d.dataMu.Lock()
 		atomic.StoreUint64(&d.size, attr.Size)
@@ -687,10 +698,7 @@ func (d *dentry) fileType() uint32 {
 func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
 	stat.Blksize = atomic.LoadUint32(&d.blockSize)
-	stat.Nlink = 1
-	if d.isDir() {
-		stat.Nlink = 2
-	}
+	stat.Nlink = atomic.LoadUint32(&d.nlink)
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
@@ -703,7 +711,7 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
 	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
 	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
-	// TODO(jamieliu): device number
+	// TODO(gvisor.dev/issue/1198): device number
 }
 
 func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
@@ -713,7 +721,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
@@ -835,8 +844,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	return nil
 }
 
-func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
@@ -1094,6 +1103,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 	return nil
 }
 
+// incLinks increments link count.
+//
+// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
+func (d *dentry) incLinks() {
+	v := atomic.AddUint32(&d.nlink, 1)
+	if v < 2 {
+		panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+	}
+}
+
+// decLinks decrements link count.
+//
+// Preconditions: d.nlink > 1.
+func (d *dentry) decLinks() {
+	v := atomic.AddUint32(&d.nlink, ^uint32(0))
+	if v == 0 {
+		panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+	}
+}
+
 // fileDescription is embedded by gofer implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -1112,7 +1141,8 @@ func (fd *fileDescription) dentry() *dentry {
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
+	if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
 		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
 		// available?
 		if err := d.updateFromGetattr(ctx); err != nil {
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 5d67f88e3..82e1fb74b 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -5,9 +5,11 @@ licenses(["notice"])
 go_library(
     name = "host",
     srcs = [
-        "default_file.go",
         "host.go",
+        "ioctl_unsafe.go",
+        "tty.go",
         "util.go",
+        "util_unsafe.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -17,9 +19,12 @@ go_library(
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/unimpl",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
deleted file mode 100644
index 459238603..000000000
--- a/pkg/sentry/fsimpl/host/default_file.go
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package host
-
-import (
-	"math"
-	"syscall"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fd"
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files.
-type defaultFileFD struct {
-	fileDescription
-
-	// canMap specifies whether we allow the file to be memory mapped.
-	canMap bool
-
-	// mu protects the fields below.
-	mu sync.Mutex
-
-	// offset specifies the current file offset.
-	offset int64
-}
-
-// TODO(gvisor.dev/issue/1672): Implement Waitable interface.
-
-// PRead implements FileDescriptionImpl.
-func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		return 0, syserror.ESPIPE
-	}
-
-	return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags))
-}
-
-// Read implements FileDescriptionImpl.
-func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		// These files can't be memory mapped, assert this.
-		if f.canMap {
-			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-		}
-
-		n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags))
-		if isBlockError(err) {
-			// If we got any data at all, return it as a "completed" partial read
-			// rather than retrying until complete.
-			if n != 0 {
-				err = nil
-			} else {
-				err = syserror.ErrWouldBlock
-			}
-		}
-		return n, err
-	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	f.mu.Lock()
-	n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags))
-	f.offset += n
-	f.mu.Unlock()
-	return n, err
-}
-
-func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
-	if flags != 0 {
-		return 0, syserror.EOPNOTSUPP
-	}
-
-	var reader safemem.Reader
-	if offset == -1 {
-		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
-	} else {
-		reader = safemem.FromVecReaderFunc{
-			func(srcs [][]byte) (int64, error) {
-				n, err := unix.Preadv(hostFD, srcs, offset)
-				return int64(n), err
-			},
-		}
-	}
-	n, err := dst.CopyOutFrom(ctx, reader)
-	return int64(n), err
-}
-
-// PWrite implements FileDescriptionImpl.
-func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		return 0, syserror.ESPIPE
-	}
-	return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags))
-}
-
-// Write implements FileDescriptionImpl.
-func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
-	if f.inode.isStream {
-		// These files can't be memory mapped, assert this.
-		if f.canMap {
-			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-		}
-
-		n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags))
-		if isBlockError(err) {
-			err = syserror.ErrWouldBlock
-		}
-		return n, err
-	}
-	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
-	f.mu.Lock()
-	n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags))
-	f.offset += n
-	f.mu.Unlock()
-	return n, err
-}
-
-func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
-	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
-	if flags != 0 {
-		return 0, syserror.EOPNOTSUPP
-	}
-
-	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
-	if err != nil {
-		return 0, err
-	}
-	src = src.TakeFirst64(limit)
-
-	var writer safemem.Writer
-	if offset == -1 {
-		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
-	} else {
-		writer = safemem.FromVecWriterFunc{
-			func(srcs [][]byte) (int64, error) {
-				n, err := unix.Pwritev(hostFD, srcs, offset)
-				return int64(n), err
-			},
-		}
-	}
-	n, err := src.CopyInTo(ctx, writer)
-	return int64(n), err
-}
-
-// Seek implements FileDescriptionImpl.
-//
-// Note that we do not support seeking on directories, since we do not even
-// allow directory fds to be imported at all.
-func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
-	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
-	if f.inode.isStream {
-		return 0, syserror.ESPIPE
-	}
-
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	switch whence {
-	case linux.SEEK_SET:
-		if offset < 0 {
-			return f.offset, syserror.EINVAL
-		}
-		f.offset = offset
-
-	case linux.SEEK_CUR:
-		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
-		if offset > math.MaxInt64-f.offset {
-			return f.offset, syserror.EOVERFLOW
-		}
-		if f.offset+offset < 0 {
-			return f.offset, syserror.EINVAL
-		}
-		f.offset += offset
-
-	case linux.SEEK_END:
-		var s syscall.Stat_t
-		if err := syscall.Fstat(f.inode.hostFD, &s); err != nil {
-			return f.offset, err
-		}
-		size := s.Size
-
-		// Check for overflow. Note that underflow cannot occur, since size >= 0.
-		if offset > math.MaxInt64-size {
-			return f.offset, syserror.EOVERFLOW
-		}
-		if size+offset < 0 {
-			return f.offset, syserror.EINVAL
-		}
-		f.offset = size + offset
-
-	case linux.SEEK_DATA, linux.SEEK_HOLE:
-		// Modifying the offset in the host file table should not matter, since
-		// this is the only place where we use it.
-		//
-		// For reading and writing, we always rely on our internal offset.
-		n, err := unix.Seek(f.inode.hostFD, offset, int(whence))
-		if err != nil {
-			return f.offset, err
-		}
-		f.offset = n
-
-	default:
-		// Invalid whence.
-		return f.offset, syserror.EINVAL
-	}
-
-	return f.offset, nil
-}
-
-// Sync implements FileDescriptionImpl.
-func (f *defaultFileFD) Sync(context.Context) error {
-	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
-	return unix.Fsync(f.inode.hostFD)
-}
-
-// ConfigureMMap implements FileDescriptionImpl.
-func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
-	if !f.canMap {
-		return syserror.ENODEV
-	}
-	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
-	return syserror.ENODEV
-}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 2eebcd60c..a54985ef5 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -19,18 +19,23 @@ package host
 import (
 	"errors"
 	"fmt"
+	"math"
 	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // filesystem implements vfs.FilesystemImpl.
@@ -70,10 +75,20 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID
 		hostFD:   hostFD,
 		isStream: isStream,
 		isTTY:    isTTY,
+		canMap:   canMap(uint32(fileType)),
 		ino:      fs.NextIno(),
 		mode:     fileMode,
 		uid:      ownerUID,
 		gid:      ownerGID,
+		// For simplicity, set offset to 0. Technically, we should
+		// only set to 0 on files that are not seekable (sockets, pipes, etc.),
+		// and use the offset from the host fd otherwise.
+		offset: 0,
+	}
+
+	// These files can't be memory mapped, assert this.
+	if i.isStream && i.canMap {
+		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 	}
 
 	d := &kernfs.Dentry{}
@@ -110,12 +125,17 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	isTTY bool
 
+	// canMap specifies whether we allow the file to be memory mapped.
+	//
+	// This field is initialized at creation time and is immutable.
+	canMap bool
+
 	// ino is an inode number unique within this filesystem.
+	//
+	// This field is initialized at creation time and is immutable.
 	ino uint64
 
-	// mu protects the inode metadata below.
-	// TODO(gvisor.dev/issue/1672): actually protect fields below.
-	//mu sync.Mutex
+	// TODO(gvisor.dev/issue/1672): protect mode, uid, and gid with mutex.
 
 	// mode is the file mode of this inode. Note that this value may become out
 	// of date if the mode is changed on the host, e.g. with chmod.
@@ -125,6 +145,12 @@ type inode struct {
 	// file created on import, not the fd on the host.
 	uid auth.KUID
 	gid auth.KGID
+
+	// offsetMu protects offset.
+	offsetMu sync.Mutex
+
+	// offset specifies the current file offset.
+	offset int64
 }
 
 // Note that these flags may become out of date, since they can be modified
@@ -141,8 +167,8 @@ func fileFlagsFromHostFD(fd int) (int, error) {
 }
 
 // CheckPermissions implements kernfs.Inode.
-func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid)
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, i.mode, i.uid, i.gid)
 }
 
 // Mode implements kernfs.Inode.
@@ -280,7 +306,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &s, i.Mode(), i.uid, i.gid); err != nil {
 		return err
 	}
 
@@ -296,11 +322,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		}
 	}
 	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
-		timestamps := []unix.Timespec{
+		ts := [2]syscall.Timespec{
 			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
 			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
 		}
-		if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil {
+		if err := setTimestamps(i.hostFD, &ts); err != nil {
 			return err
 		}
 	}
@@ -336,36 +362,40 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error
 
 	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
 	// we don't allow importing arbitrary file types without proper support.
+	var (
+		vfsfd  *vfs.FileDescription
+		fdImpl vfs.FileDescriptionImpl
+	)
 	if i.isTTY {
-		// TODO(gvisor.dev/issue/1672): support importing host fd as TTY.
-		return nil, errors.New("importing host fd as TTY not supported")
-	}
-
-	// For simplicity, set offset to 0. Technically, we should
-	// only set to 0 on files that are not seekable (sockets, pipes, etc.),
-	// and use the offset from the host fd otherwise.
-	fd := &defaultFileFD{
-		fileDescription: fileDescription{
-			inode: i,
-		},
-		canMap: canMap(uint32(fileType)),
-		mu:     sync.Mutex{},
-		offset: 0,
+		fd := &ttyFD{
+			fileDescription: fileDescription{inode: i},
+			termios:         linux.DefaultSlaveTermios,
+		}
+		vfsfd = &fd.vfsfd
+		fdImpl = fd
+	} else {
+		// For simplicity, set offset to 0. Technically, we should
+		// only set to 0 on files that are not seekable (sockets, pipes, etc.),
+		// and use the offset from the host fd otherwise.
+		fd := &fileDescription{inode: i}
+		vfsfd = &fd.vfsfd
+		fdImpl = fd
 	}
 
-	vfsfd := &fd.vfsfd
 	flags, err := fileFlagsFromHostFD(i.hostFD)
 	if err != nil {
 		return nil, err
 	}
 
-	if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := vfsfd.Init(fdImpl, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return vfsfd, nil
 }
 
 // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// TODO(gvisor.dev/issue/1672): Implement Waitable interface.
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -394,3 +424,193 @@ func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.S
 func (f *fileDescription) Release() {
 	// noop
 }
+
+// PRead implements FileDescriptionImpl.
+func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
+}
+
+// Read implements FileDescriptionImpl.
+func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+		if isBlockError(err) {
+			// If we got any data at all, return it as a "completed" partial read
+			// rather than retrying until complete.
+			if n != 0 {
+				err = nil
+			} else {
+				err = syserror.ErrWouldBlock
+			}
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	i.offsetMu.Lock()
+	n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags)
+	i.offset += n
+	i.offsetMu.Unlock()
+	return n, err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	var reader safemem.Reader
+	if offset == -1 {
+		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
+	} else {
+		reader = safemem.FromVecReaderFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Preadv(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
+	}
+	n, err := dst.CopyOutFrom(ctx, reader)
+	return int64(n), err
+}
+
+// PWrite implements FileDescriptionImpl.
+func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+}
+
+// Write implements FileDescriptionImpl.
+func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null.
+	if i.isStream {
+		n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+	i.offsetMu.Lock()
+	n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags)
+	i.offset += n
+	i.offsetMu.Unlock()
+	return n, err
+}
+
+func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	var writer safemem.Writer
+	if offset == -1 {
+		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
+	} else {
+		writer = safemem.FromVecWriterFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Pwritev(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
+	}
+	n, err := src.CopyInTo(ctx, writer)
+	return int64(n), err
+}
+
+// Seek implements FileDescriptionImpl.
+//
+// Note that we do not support seeking on directories, since we do not even
+// allow directory fds to be imported at all.
+func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
+	i := f.inode
+	// TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null.
+	if i.isStream {
+		return 0, syserror.ESPIPE
+	}
+
+	i.offsetMu.Lock()
+	defer i.offsetMu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return i.offset, syserror.EINVAL
+		}
+		i.offset = offset
+
+	case linux.SEEK_CUR:
+		// Check for overflow. Note that underflow cannot occur, since i.offset >= 0.
+		if offset > math.MaxInt64-i.offset {
+			return i.offset, syserror.EOVERFLOW
+		}
+		if i.offset+offset < 0 {
+			return i.offset, syserror.EINVAL
+		}
+		i.offset += offset
+
+	case linux.SEEK_END:
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			return i.offset, err
+		}
+		size := s.Size
+
+		// Check for overflow. Note that underflow cannot occur, since size >= 0.
+		if offset > math.MaxInt64-size {
+			return i.offset, syserror.EOVERFLOW
+		}
+		if size+offset < 0 {
+			return i.offset, syserror.EINVAL
+		}
+		i.offset = size + offset
+
+	case linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Modifying the offset in the host file table should not matter, since
+		// this is the only place where we use it.
+		//
+		// For reading and writing, we always rely on our internal offset.
+		n, err := unix.Seek(i.hostFD, offset, int(whence))
+		if err != nil {
+			return i.offset, err
+		}
+		i.offset = n
+
+	default:
+		// Invalid whence.
+		return i.offset, syserror.EINVAL
+	}
+
+	return i.offset, nil
+}
+
+// Sync implements FileDescriptionImpl.
+func (f *fileDescription) Sync(context.Context) error {
+	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	return unix.Fsync(f.inode.hostFD)
+}
+
+// ConfigureMMap implements FileDescriptionImpl.
+func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
+	if !f.inode.canMap {
+		return syserror.ENODEV
+	}
+	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
+	return syserror.ENODEV
+}
diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
new file mode 100644
index 000000000..0983bf7d8
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+	var t linux.Termios
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &t, nil
+}
+
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func ioctlGetWinsize(fd int) (*linux.Winsize, error) {
+	var w linux.Winsize
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &w, nil
+}
+
+func ioctlSetWinsize(fd int, w *linux.Winsize) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
new file mode 100644
index 000000000..8936afb06
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -0,0 +1,379 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor
+// that wraps a TTY FD.
+type ttyFD struct {
+	fileDescription
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// session is the session attached to this ttyFD.
+	session *kernel.Session
+
+	// fgProcessGroup is the foreground process group that is currently
+	// connected to this TTY.
+	fgProcessGroup *kernel.ProcessGroup
+
+	// termios contains the terminal attributes for this TTY.
+	termios linux.KernelTermios
+}
+
+// InitForegroundProcessGroup sets the foreground process group and session for
+// the TTY. This should only be called once, after the foreground process group
+// has been created, but before it has started running.
+func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.fgProcessGroup != nil {
+		panic("foreground process group is already set")
+	}
+	t.fgProcessGroup = pg
+	t.session = pg.Session()
+}
+
+// ForegroundProcessGroup returns the foreground process for the TTY.
+func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.fgProcessGroup
+}
+
+// Release implements fs.FileOperations.Release.
+func (t *ttyFD) Release() {
+	t.mu.Lock()
+	t.fgProcessGroup = nil
+	t.mu.Unlock()
+
+	t.fileDescription.Release()
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.Write(ctx, src, opts)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// Ignore arg[0]. This is the real FD:
+	fd := t.inode.hostFD
+	ioctl := args[1].Uint64()
+	switch ioctl {
+	case linux.TCGETS:
+		termios, err := ioctlGetTermios(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+
+		var termios linux.Termios
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetTermios(fd, ioctl, &termios)
+		if err == nil {
+			t.termios.FromTermios(termios)
+		}
+		return 0, err
+
+	case linux.TIOCGPGRP:
+		// Args: pid_t *argp
+		// When successful, equivalent to *argp = tcgetpgrp(fd).
+		// Get the process group ID of the foreground process group on this
+		// terminal.
+
+		pidns := kernel.PIDNamespaceFromContext(ctx)
+		if pidns == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
+		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSPGRP:
+		// Args: const pid_t *argp
+		// Equivalent to tcsetpgrp(fd, *argp).
+		// Set the foreground process group ID of this terminal.
+
+		task := kernel.TaskFromContext(ctx)
+		if task == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Check that we are allowed to set the process group.
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			// drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change()
+			// to -ENOTTY.
+			if err == syserror.EIO {
+				return 0, syserror.ENOTTY
+			}
+			return 0, err
+		}
+
+		// Check that calling task's process group is in the TTY session.
+		if task.ThreadGroup().Session() != t.session {
+			return 0, syserror.ENOTTY
+		}
+
+		var pgID kernel.ProcessGroupID
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+
+		// pgID must be non-negative.
+		if pgID < 0 {
+			return 0, syserror.EINVAL
+		}
+
+		// Process group with pgID must exist in this PID namespace.
+		pidns := task.PIDNamespace()
+		pg := pidns.ProcessGroupWithID(pgID)
+		if pg == nil {
+			return 0, syserror.ESRCH
+		}
+
+		// Check that new process group is in the TTY session.
+		if pg.Session() != t.session {
+			return 0, syserror.EPERM
+		}
+
+		t.fgProcessGroup = pg
+		return 0, nil
+
+	case linux.TIOCGWINSZ:
+		// Args: struct winsize *argp
+		// Get window size.
+		winsize, err := ioctlGetWinsize(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSWINSZ:
+		// Args: const struct winsize *argp
+		// Set window size.
+
+		// Unlike setting the termios, any process group (even background ones) can
+		// set the winsize.
+
+		var winsize linux.Winsize
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetWinsize(fd, &winsize)
+		return 0, err
+
+	// Unimplemented commands.
+	case linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCNOTTY,
+		linux.TIOCSCTTY,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+		fallthrough
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// checkChange checks that the process group is allowed to read, write, or
+// change the state of the TTY.
+//
+// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic
+// is a bit convoluted, but documented inline.
+//
+// Preconditions: t.mu must be held.
+func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		// No task? Linux does not have an analog for this case, but
+		// tty_check_change is more of a blacklist of cases than a
+		// whitelist, and is surprisingly permissive. Allowing the
+		// change seems most appropriate.
+		return nil
+	}
+
+	tg := task.ThreadGroup()
+	pg := tg.ProcessGroup()
+
+	// If the session for the task is different than the session for the
+	// controlling TTY, then the change is allowed. Seems like a bad idea,
+	// but that's exactly what linux does.
+	if tg.Session() != t.fgProcessGroup.Session() {
+		return nil
+	}
+
+	// If we are the foreground process group, then the change is allowed.
+	if pg == t.fgProcessGroup {
+		return nil
+	}
+
+	// We are not the foreground process group.
+
+	// Is the provided signal blocked or ignored?
+	if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) {
+		// If the signal is SIGTTIN, then we are attempting to read
+		// from the TTY. Don't send the signal and return EIO.
+		if sig == linux.SIGTTIN {
+			return syserror.EIO
+		}
+
+		// Otherwise, we are writing or changing terminal state. This is allowed.
+		return nil
+	}
+
+	// If the process group is an orphan, return EIO.
+	if pg.IsOrphan() {
+		return syserror.EIO
+	}
+
+	// Otherwise, send the signal to the process group and return ERESTARTSYS.
+	//
+	// Note that Linux also unconditionally sets TIF_SIGPENDING on current,
+	// but this isn't necessary in gVisor because the rationale given in
+	// 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't
+	// apply: the sentry will handle -ERESTARTSYS in
+	// kernel.runApp.execute() even if the kernel.Task isn't interrupted.
+	//
+	// Linux ignores the result of kill_pgrp().
+	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
+	return kernel.ERESTARTSYS
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index d519feef5..2bc757b1a 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -22,15 +22,15 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec {
+func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec {
 	if omit {
-		return unix.Timespec{
+		return syscall.Timespec{
 			Sec:  0,
 			Nsec: unix.UTIME_OMIT,
 		}
 	}
-	return unix.Timespec{
-		Sec:  int64(ts.Sec),
+	return syscall.Timespec{
+		Sec:  ts.Sec,
 		Nsec: int64(ts.Nsec),
 	}
 }
diff --git a/pkg/sentry/fsimpl/host/util_unsafe.go b/pkg/sentry/fsimpl/host/util_unsafe.go
new file mode 100644
index 000000000..5136ac844
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util_unsafe.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func setTimestamps(fd int, ts *[2]syscall.Timespec) error {
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(fd),
+		0, /* path */
+		uintptr(unsafe.Pointer(ts)),
+		0, /* flags */
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 75c4bab1a..bfa786c88 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -206,8 +206,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	fs := fd.filesystem()
 	creds := auth.CredentialsFromContext(ctx)
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(ctx, fs, creds, opts)
+	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index c612dcf07..5c84b10c9 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -241,7 +241,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
 
@@ -273,12 +273,10 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 
 // CheckPermissions implements Inode.CheckPermissions.
 func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
-	mode := a.Mode()
 	return vfs.GenericCheckPermissions(
 		creds,
 		ats,
-		mode.FileType() == linux.ModeDirectory,
-		uint16(mode),
+		a.Mode(),
 		auth.KUID(atomic.LoadUint32(&a.uid)),
 		auth.KGID(atomic.LoadUint32(&a.gid)),
 	)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 49d6efb0e..aee2a4392 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -172,14 +172,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S
 func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := i.Mode()
 	uid, gid := i.getOwner(mode)
-	return vfs.GenericCheckPermissions(
-		creds,
-		ats,
-		mode.FileType() == linux.ModeDirectory,
-		uint16(mode),
-		uid,
-		gid,
-	)
+	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
 }
 
 func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 84b181b90..83bf885ee 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -15,6 +15,8 @@
 package tmpfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -33,6 +35,14 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
 		major: major,
 		minor: minor,
 	}
+	switch kind {
+	case vfs.BlockDevice:
+		mode |= linux.S_IFBLK
+	case vfs.CharDevice:
+		mode |= linux.S_IFCHR
+	default:
+		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
+	}
 	file.inode.init(file, fs, creds, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index b4380af38..37c75ab64 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -34,16 +34,11 @@ type directory struct {
 
 func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, mode)
+	dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
 	return &dir.inode
 }
 
-func (i *inode) isDir() bool {
-	_, ok := i.impl.(*directory)
-	return ok
-}
-
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 75d01b853..12cc64385 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -41,7 +41,7 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 	if !d.inode.isDir() {
 		return nil, syserror.ENOTDIR
 	}
-	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
@@ -125,7 +125,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -163,7 +163,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
 	if err != nil {
 		return err
 	}
-	return d.inode.checkPermissions(creds, ats, d.inode.isDir())
+	return d.inode.checkPermissions(creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -178,7 +178,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.inode.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -301,7 +301,7 @@ afterTrailingSymlink:
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -316,7 +316,7 @@ afterTrailingSymlink:
 	child, err := stepLocked(rp, parent)
 	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
-		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -347,7 +347,7 @@ afterTrailingSymlink:
 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if !afterCreate {
-		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
 	}
@@ -428,7 +428,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	defer mnt.EndWrite()
 
 	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
@@ -445,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 		if oldParent != newParent {
 			// Writability is needed to change renamed's "..".
-			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -455,7 +455,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 	}
 
-	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	replacedVFSD := newParent.vfsd.Child(newName)
@@ -528,7 +528,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -621,7 +621,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 0c57fdca3..2c5c739df 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -34,7 +34,7 @@ type namedPipe struct {
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 5a2896bf6..26cd65605 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -89,7 +89,7 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod
 	file := &regularFile{
 		memFile: fs.memFile,
 	}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, creds, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 5246aca84..47e075ed4 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -15,6 +15,7 @@
 package tmpfs
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
@@ -27,7 +28,7 @@ func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, 0777)
+	link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index ff69372b3..2f9e6c876 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -144,7 +144,7 @@ type inode struct {
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
 	mu    sync.Mutex
-	mode  uint32 // excluding file type bits, which are based on impl
+	mode  uint32 // file type and mode
 	nlink uint32 // protected by filesystem.mu instead of inode.mu
 	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
 	gid   uint32 // auth.KGID, but ...
@@ -168,6 +168,9 @@ type inode struct {
 const maxLinks = math.MaxUint32
 
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic("file type is required in FileMode")
+	}
 	i.clock = fs.clock
 	i.refs = 1
 	i.mode = uint32(mode)
@@ -242,8 +245,9 @@ func (i *inode) decRef() {
 	}
 }
 
-func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
 }
 
 // Go won't inline this function, and returning linux.Statx (which is quite
@@ -269,31 +273,21 @@ func (i *inode) statTo(stat *linux.Statx) {
 	// TODO(gvisor.dev/issues/1197): Device number.
 	switch impl := i.impl.(type) {
 	case *regularFile:
-		stat.Mode |= linux.S_IFREG
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(atomic.LoadUint64(&impl.size))
 		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
 		// a uint64 accessed using atomic memory operations to avoid taking
 		// locks).
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *directory:
-		stat.Mode |= linux.S_IFDIR
 	case *symlink:
-		stat.Mode |= linux.S_IFLNK
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(len(impl.target))
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *namedPipe:
-		stat.Mode |= linux.S_IFIFO
 	case *deviceFile:
-		switch impl.kind {
-		case vfs.BlockDevice:
-			stat.Mode |= linux.S_IFBLK
-		case vfs.CharDevice:
-			stat.Mode |= linux.S_IFCHR
-		}
 		stat.RdevMajor = impl.major
 		stat.RdevMinor = impl.minor
+	case *directory, *namedPipe:
+		// Nothing to do.
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
 	}
@@ -306,7 +300,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
 		return err
 	}
 	i.mu.Lock()
@@ -316,7 +311,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	)
 	mask := stat.Mask
 	if mask&linux.STATX_MODE != 0 {
-		atomic.StoreUint32(&i.mode, uint32(stat.Mode))
+		ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT
+		atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT))
 		needsCtimeBump = true
 	}
 	if mask&linux.STATX_UID != 0 {
@@ -439,6 +435,10 @@ func (i *inode) direntType() uint8 {
 	}
 }
 
+func (i *inode) isDir() bool {
+	return linux.FileMode(i.mode).FileType() == linux.S_IFDIR
+}
+
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 8bffb78fc..592650923 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -296,8 +296,10 @@ func (*readyCallback) Callback(w *waiter.Entry) {
 		e.waitingList.Remove(entry)
 		e.readyList.PushBack(entry)
 		entry.curList = &e.readyList
+		e.listsMu.Unlock()
 
 		e.Notify(waiter.EventIn)
+		return
 	}
 
 	e.listsMu.Unlock()
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index dddc28d5a..d09d97825 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -338,7 +338,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 		fd = f.next
 	}
 	for fd < end {
-		if d, _, _ := f.get(fd); d == nil {
+		if d, _, _ := f.getVFS2(fd); d == nil {
 			f.setVFS2(fd, file, flags)
 			if fd == f.next {
 				// Update next search start position.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index d42eda37b..db6465663 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -394,6 +394,8 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 
 #define STACK_FRAME_SIZE 16
 
+// kernelExitToEl0 is the entrypoint for application in guest_el0.
+// Prepare the vcpu environment for container application.
 TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 	// Step1, save sentry context into memory.
 	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
@@ -464,7 +466,23 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 
 	ERET()
 
+// kernelExitToEl1 is the entrypoint for sentry in guest_el1.
+// Prepare the vcpu environment for sentry.
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+
+	MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1
+	WORD $0xd5184001  //MSR R1, SPSR_EL1
+
+	MOVD CPU_REGISTERS+PTRACE_PC(RSV_REG), R1
+	MSR R1, ELR_EL1
+
+	MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
+	MOVD R1, RSP
+
+	REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
+	MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
+
 	ERET()
 
 // Start is the CPU entrypoint.
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 7cd2ce55b..e801abeb8 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -22,7 +22,6 @@ go_library(
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index b4b244abf..0336a32d8 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -19,7 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -37,12 +37,12 @@ type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
-	// marshal converts from an iptables.Matcher to an ABI struct.
-	marshal(matcher iptables.Matcher) []byte
+	// marshal converts from an stack.Matcher to an ABI struct.
+	marshal(matcher stack.Matcher) []byte
 
 	// unmarshal converts from the ABI matcher struct to an
-	// iptables.Matcher.
-	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
+	// stack.Matcher.
+	unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error)
 }
 
 // matchMakers maps the name of supported matchers to the matchMaker that
@@ -58,7 +58,7 @@ func registerMatchMaker(mm matchMaker) {
 	matchMakers[mm.name()] = mm
 }
 
-func marshalMatcher(matcher iptables.Matcher) []byte {
+func marshalMatcher(matcher stack.Matcher) []byte {
 	matchMaker, ok := matchMakers[matcher.Name()]
 	if !ok {
 		panic(fmt.Sprintf("Unknown matcher of type %T.", matcher))
@@ -86,7 +86,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 	return append(buf, make([]byte, size-len(buf))...)
 }
 
-func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
+func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) {
 	matchMaker, ok := matchMakers[match.Name.String()]
 	if !ok {
 		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index b5b9be46f..55bcc3ace 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -27,7 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -129,19 +128,19 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) {
-	ipt := stack.IPTables()
+func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
+	ipt := stk.IPTables()
 	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
-		return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename)
+		return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
 
 // FillDefaultIPTables sets stack's IPTables to the default tables and
 // populates them with metadata.
-func FillDefaultIPTables(stack *stack.Stack) {
-	ipt := iptables.DefaultTables()
+func FillDefaultIPTables(stk *stack.Stack) {
+	ipt := stack.DefaultTables()
 
 	// In order to fill in the metadata, we have to translate ipt from its
 	// netstack format to Linux's giant-binary-blob format.
@@ -154,14 +153,14 @@ func FillDefaultIPTables(stack *stack.Stack) {
 		ipt.Tables[name] = table
 	}
 
-	stack.SetIPTables(ipt)
+	stk.SetIPTables(ipt)
 }
 
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) {
+func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
@@ -234,19 +233,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-func marshalTarget(target iptables.Target) []byte {
+func marshalTarget(target stack.Target) []byte {
 	switch tg := target.(type) {
-	case iptables.AcceptTarget:
-		return marshalStandardTarget(iptables.RuleAccept)
-	case iptables.DropTarget:
-		return marshalStandardTarget(iptables.RuleDrop)
-	case iptables.ErrorTarget:
+	case stack.AcceptTarget:
+		return marshalStandardTarget(stack.RuleAccept)
+	case stack.DropTarget:
+		return marshalStandardTarget(stack.RuleDrop)
+	case stack.ErrorTarget:
 		return marshalErrorTarget(errorTargetName)
-	case iptables.UserChainTarget:
+	case stack.UserChainTarget:
 		return marshalErrorTarget(tg.Name)
-	case iptables.ReturnTarget:
-		return marshalStandardTarget(iptables.RuleReturn)
-	case iptables.RedirectTarget:
+	case stack.ReturnTarget:
+		return marshalStandardTarget(stack.RuleReturn)
+	case stack.RedirectTarget:
 		return marshalRedirectTarget()
 	case JumpTarget:
 		return marshalJumpTarget(tg)
@@ -255,7 +254,7 @@ func marshalTarget(target iptables.Target) []byte {
 	}
 }
 
-func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
+func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
 	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
@@ -316,13 +315,13 @@ func marshalJumpTarget(jt JumpTarget) []byte {
 
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
+func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
 	switch verdict {
-	case iptables.RuleAccept:
+	case stack.RuleAccept:
 		return -linux.NF_ACCEPT - 1
-	case iptables.RuleDrop:
+	case stack.RuleDrop:
 		return -linux.NF_DROP - 1
-	case iptables.RuleReturn:
+	case stack.RuleReturn:
 		return linux.NF_RETURN
 	default:
 		// TODO(gvisor.dev/issue/170): Support Jump.
@@ -331,18 +330,18 @@ func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
 }
 
 // translateToStandardTarget translates from the value in a
-// linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardTarget(val int32) (iptables.Target, error) {
+// linux.XTStandardTarget to an stack.Verdict.
+func translateToStandardTarget(val int32) (stack.Target, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return iptables.AcceptTarget{}, nil
+		return stack.AcceptTarget{}, nil
 	case -linux.NF_DROP - 1:
-		return iptables.DropTarget{}, nil
+		return stack.DropTarget{}, nil
 	case -linux.NF_QUEUE - 1:
 		return nil, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		return iptables.ReturnTarget{}, nil
+		return stack.ReturnTarget{}, nil
 	default:
 		return nil, fmt.Errorf("unknown iptables verdict %d", val)
 	}
@@ -350,7 +349,7 @@ func translateToStandardTarget(val int32) (iptables.Target, error) {
 
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
-func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
+func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
 		nflog("optVal has insufficient size for replace %d", len(optVal))
@@ -362,12 +361,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
 
 	// TODO(gvisor.dev/issue/170): Support other tables.
-	var table iptables.Table
+	var table stack.Table
 	switch replace.Name.String() {
-	case iptables.TablenameFilter:
-		table = iptables.EmptyFilterTable()
-	case iptables.TablenameNat:
-		table = iptables.EmptyNatTable()
+	case stack.TablenameFilter:
+		table = stack.EmptyFilterTable()
+	case stack.TablenameNat:
+		table = stack.EmptyNatTable()
 	default:
 		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
@@ -434,7 +433,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 		optVal = optVal[targetSize:]
 
-		table.Rules = append(table.Rules, iptables.Rule{
+		table.Rules = append(table.Rules, stack.Rule{
 			Filter:   filter,
 			Target:   target,
 			Matchers: matchers,
@@ -465,11 +464,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 					table.Underflows[hk] = ruleIdx
 				}
 			}
-			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
+			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == stack.HookUnset {
 				nflog("hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
-			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
+			if ruleIdx := table.Underflows[hk]; ruleIdx == stack.HookUnset {
 				nflog("underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
@@ -478,7 +477,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Add the user chains.
 	for ruleIdx, rule := range table.Rules {
-		target, ok := rule.Target.(iptables.UserChainTarget)
+		target, ok := rule.Target.(stack.UserChainTarget)
 		if !ok {
 			continue
 		}
@@ -522,8 +521,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// PREROUTING chain right now, make sure all other chains point to
 	// ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
-		if hook != iptables.Input && hook != iptables.Prerouting {
-			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
+		if hook != stack.Input && hook != stack.Prerouting {
+			if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -535,7 +534,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	ipt := stack.IPTables()
+	ipt := stk.IPTables()
 	table.SetMetadata(metadata{
 		HookEntry:  replace.HookEntry,
 		Underflow:  replace.Underflow,
@@ -543,16 +542,16 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		Size:       replace.Size,
 	})
 	ipt.Tables[replace.Name.String()] = table
-	stack.SetIPTables(ipt)
+	stk.SetIPTables(ipt)
 
 	return nil
 }
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
-func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) {
+func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) {
 	nflog("set entries: parsing matchers of size %d", len(optVal))
-	var matchers []iptables.Matcher
+	var matchers []stack.Matcher
 	for len(optVal) > 0 {
 		nflog("set entries: optVal has len %d", len(optVal))
 
@@ -594,7 +593,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target, error) {
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
@@ -638,11 +637,11 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 		switch name := errorTarget.Name.String(); name {
 		case errorTargetName:
 			nflog("set entries: error target")
-			return iptables.ErrorTarget{}, nil
+			return stack.ErrorTarget{}, nil
 		default:
 			// User defined chain.
 			nflog("set entries: user-defined target %q", name)
-			return iptables.UserChainTarget{Name: name}, nil
+			return stack.UserChainTarget{Name: name}, nil
 		}
 
 	case redirectTargetName:
@@ -659,8 +658,8 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 		buf = optVal[:linux.SizeOfXTRedirectTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
 
-		// Copy linux.XTRedirectTarget to iptables.RedirectTarget.
-		var target iptables.RedirectTarget
+		// Copy linux.XTRedirectTarget to stack.RedirectTarget.
+		var target stack.RedirectTarget
 		nfRange := redirectTarget.NfRange
 
 		// RangeSize should be 1.
@@ -699,14 +698,14 @@ func parseTarget(filter iptables.IPHeaderFilter, optVal []byte) (iptables.Target
 	return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
+func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
-		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
 	}
 	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
-		return iptables.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
 	}
-	return iptables.IPHeaderFilter{
+	return stack.IPHeaderFilter{
 		Protocol:  tcpip.TransportProtocolNumber(iptip.Protocol),
 		Dst:       tcpip.Address(iptip.Dst[:]),
 		DstMask:   tcpip.Address(iptip.DstMask[:]),
@@ -733,30 +732,30 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 		iptip.InverseFlags&^inverseMask != 0
 }
 
-func validUnderflow(rule iptables.Rule) bool {
+func validUnderflow(rule stack.Rule) bool {
 	if len(rule.Matchers) != 0 {
 		return false
 	}
 	switch rule.Target.(type) {
-	case iptables.AcceptTarget, iptables.DropTarget:
+	case stack.AcceptTarget, stack.DropTarget:
 		return true
 	default:
 		return false
 	}
 }
 
-func hookFromLinux(hook int) iptables.Hook {
+func hookFromLinux(hook int) stack.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
-		return iptables.Prerouting
+		return stack.Prerouting
 	case linux.NF_INET_LOCAL_IN:
-		return iptables.Input
+		return stack.Input
 	case linux.NF_INET_FORWARD:
-		return iptables.Forward
+		return stack.Forward
 	case linux.NF_INET_LOCAL_OUT:
-		return iptables.Output
+		return stack.Output
 	case linux.NF_INET_POST_ROUTING:
-		return iptables.Postrouting
+		return stack.Postrouting
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index c421b87cf..c948de876 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,11 +15,10 @@
 package netfilter
 
 import (
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
-// JumpTarget implements iptables.Target.
+// JumpTarget implements stack.Target.
 type JumpTarget struct {
 	// Offset is the byte offset of the rule to jump to. It is used for
 	// marshaling and unmarshaling.
@@ -29,7 +28,7 @@ type JumpTarget struct {
 	RuleNum int
 }
 
-// Action implements iptables.Target.Action.
-func (jt JumpTarget) Action(tcpip.PacketBuffer) (iptables.RuleVerdict, int) {
-	return iptables.RuleJump, jt.RuleNum
+// Action implements stack.Target.Action.
+func (jt JumpTarget) Action(stack.PacketBuffer) (stack.RuleVerdict, int) {
+	return stack.RuleJump, jt.RuleNum
 }
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index f9945e214..ff1cfd8f6 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -19,9 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,7 +39,7 @@ func (tcpMarshaler) name() string {
 }
 
 // marshal implements matchMaker.marshal.
-func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
+func (tcpMarshaler) marshal(mr stack.Matcher) []byte {
 	matcher := mr.(*TCPMatcher)
 	xttcp := linux.XTTCP{
 		SourcePortStart:      matcher.sourcePortStart,
@@ -53,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 }
 
 // unmarshal implements matchMaker.unmarshal.
-func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
 	if len(buf) < linux.SizeOfXTTCP {
 		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
 	}
@@ -97,7 +96,7 @@ func (*TCPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
@@ -115,7 +114,7 @@ func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac
 	// Now we need the transport header. However, this may not have been set
 	// yet.
 	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
+	// ultimately be moved into the stack.Check codepath as matchers are
 	// added.
 	var tcpHeader header.TCP
 	if pkt.TransportHeader != nil {
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 86aa11696..3359418c1 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -19,9 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,7 +39,7 @@ func (udpMarshaler) name() string {
 }
 
 // marshal implements matchMaker.marshal.
-func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
+func (udpMarshaler) marshal(mr stack.Matcher) []byte {
 	matcher := mr.(*UDPMatcher)
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.sourcePortStart,
@@ -53,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 }
 
 // unmarshal implements matchMaker.unmarshal.
-func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
 	if len(buf) < linux.SizeOfXTUDP {
 		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
 	}
@@ -94,11 +93,11 @@ func (*UDPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
-	// into the iptables.Check codepath as matchers are added.
+	// into the stack.Check codepath as matchers are added.
 	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
 		return false, false
 	}
@@ -114,7 +113,7 @@ func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac
 	// Now we need the transport header. However, this may not have been set
 	// yet.
 	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
+	// ultimately be moved into the stack.Check codepath as matchers are
 	// added.
 	var udpHeader header.UDP
 	if pkt.TransportHeader != nil {
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index ab01cb4fa..cbf46b1e9 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -38,7 +38,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 13a9a60b4..f14c336b9 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,6 +29,7 @@ import (
 	"io"
 	"math"
 	"reflect"
+	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -264,6 +265,12 @@ type SocketOperations struct {
 	skType   linux.SockType
 	protocol int
 
+	// readViewHasData is 1 iff readView has data to be read, 0 otherwise.
+	// Must be accessed using atomic operations. It must only be written
+	// with readMu held but can be read without holding readMu. The latter
+	// is required to avoid deadlocks in epoll Readiness checks.
+	readViewHasData uint32
+
 	// readMu protects access to the below fields.
 	readMu sync.Mutex `state:"nosave"`
 	// readView contains the remaining payload from the last packet.
@@ -410,21 +417,24 @@ func (s *SocketOperations) isPacketBased() bool {
 
 // fetchReadView updates the readView field of the socket if it's currently
 // empty. It assumes that the socket is locked.
+//
+// Precondition: s.readMu must be held.
 func (s *SocketOperations) fetchReadView() *syserr.Error {
 	if len(s.readView) > 0 {
 		return nil
 	}
-
 	s.readView = nil
 	s.sender = tcpip.FullAddress{}
 
 	v, cms, err := s.Endpoint.Read(&s.sender)
 	if err != nil {
+		atomic.StoreUint32(&s.readViewHasData, 0)
 		return syserr.TranslateNetstackError(err)
 	}
 
 	s.readView = v
 	s.readCM = cms
+	atomic.StoreUint32(&s.readViewHasData, 1)
 
 	return nil
 }
@@ -623,11 +633,9 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	// Check our cached value iff the caller asked for readability and the
 	// endpoint itself is currently not readable.
 	if (mask & ^r & waiter.EventIn) != 0 {
-		s.readMu.Lock()
-		if len(s.readView) > 0 {
+		if atomic.LoadUint32(&s.readViewHasData) == 1 {
 			r |= waiter.EventIn
 		}
-		s.readMu.Unlock()
 	}
 
 	return r
@@ -2334,6 +2342,10 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 		}
 		copied += n
 		s.readView.TrimFront(n)
+		if len(s.readView) == 0 {
+			atomic.StoreUint32(&s.readViewHasData, 0)
+		}
+
 		dst = dst.DropFirst(n)
 		if e != nil {
 			err = syserr.FromError(e)
@@ -2380,9 +2392,9 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		// caller-supplied  buffer.
 		s.readMu.Lock()
 		n, err := s.coalescingRead(ctx, dst, trunc)
-		s.readMu.Unlock()
 		cmsg := s.controlMessages()
 		s.fillCmsgInq(&cmsg)
+		s.readMu.Unlock()
 		return n, 0, nil, 0, cmsg, err
 	}
 
@@ -2456,6 +2468,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		s.readView.TrimFront(int(n))
 	}
 
+	if len(s.readView) == 0 {
+		atomic.StoreUint32(&s.readViewHasData, 0)
+	}
+
 	var flags int
 	if msgLen > int(n) {
 		flags |= linux.MSG_TRUNC
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 0692482e9..f5fa18136 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -23,7 +23,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -200,36 +199,66 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	switch stats := stat.(type) {
+	case *inet.StatDev:
+		for _, ni := range s.Stack.NICInfo() {
+			if ni.Name != arg {
+				continue
+			}
+			// TODO(gvisor.dev/issue/2103) Support stubbed stats.
+			*stats = inet.StatDev{
+				// Receive section.
+				ni.Stats.Rx.Bytes.Value(),   // bytes.
+				ni.Stats.Rx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // frame.
+				0,                           // compressed.
+				0,                           // multicast.
+				// Transmit section.
+				ni.Stats.Tx.Bytes.Value(),   // bytes.
+				ni.Stats.Tx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // colls.
+				0,                           // carrier.
+				0,                           // compressed.
+			}
+			break
+		}
 	case *inet.StatSNMPIP:
 		ip := Metrics.IP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPIP{
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			0,                          // Ip/Forwarding.
+			0,                          // Ip/DefaultTTL.
 			ip.PacketsReceived.Value(), // InReceives.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			0,                          // Ip/InHdrErrors.
 			ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			0,                               // Ip/ForwDatagrams.
+			0,                               // Ip/InUnknownProtos.
+			0,                               // Ip/InDiscards.
 			ip.PacketsDelivered.Value(),     // InDelivers.
 			ip.PacketsSent.Value(),          // OutRequests.
 			ip.OutgoingPacketErrors.Value(), // OutDiscards.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+			0,                               // Ip/OutNoRoutes.
+			0,                               // Support Ip/ReasmTimeout.
+			0,                               // Support Ip/ReasmReqds.
+			0,                               // Support Ip/ReasmOKs.
+			0,                               // Support Ip/ReasmFails.
+			0,                               // Support Ip/FragOKs.
+			0,                               // Support Ip/FragFails.
+			0,                               // Support Ip/FragCreates.
 		}
 	case *inet.StatSNMPICMP:
 		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
 		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPICMP{
-			0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs.
+			0, // Icmp/InMsgs.
 			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors.
+			0,                         // Icmp/InCsumErrors.
 			in.DstUnreachable.Value(), // InDestUnreachs.
 			in.TimeExceeded.Value(),   // InTimeExcds.
 			in.ParamProblem.Value(),   // InParmProbs.
@@ -241,7 +270,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			in.TimestampReply.Value(), // InTimestampReps.
 			in.InfoRequest.Value(),    // InAddrMasks.
 			in.InfoReply.Value(),      // InAddrMaskReps.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs.
+			0,                         // Icmp/OutMsgs.
 			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
 			out.DstUnreachable.Value(),                     // OutDestUnreachs.
 			out.TimeExceeded.Value(),                       // OutTimeExcds.
@@ -277,15 +306,16 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 		}
 	case *inet.StatSNMPUDP:
 		udp := Metrics.UDP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPUDP{
 			udp.PacketsReceived.Value(),     // InDatagrams.
 			udp.UnknownPortErrors.Value(),   // NoPorts.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InErrors.
+			0,                               // Udp/InErrors.
 			udp.PacketsSent.Value(),         // OutDatagrams.
 			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti.
+			0,                               // Udp/SndbufErrors.
+			0,                               // Udp/InCsumErrors.
+			0,                               // Udp/IgnoredMulti.
 		}
 	default:
 		return syserr.ErrEndpointOperation.ToError()
@@ -332,7 +362,7 @@ func (s *Stack) RouteTable() []inet.Route {
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (iptables.IPTables, error) {
+func (s *Stack) IPTables() (stack.IPTables, error) {
 	return s.Stack.IPTables(), nil
 }
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index e7695e995..2eb210014 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -31,6 +31,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/bits",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index fc5ceea4c..a859095e2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -250,7 +250,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index ddc140b65..a61cc5059 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -97,7 +97,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		//     char           d_name[]; /* Filename (null-terminated) */
 		// };
 		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
-		if size < cb.remaining {
+		if size > cb.remaining {
 			return syserror.EINVAL
 		}
 		buf = cb.t.CopyScratchBuffer(size)
@@ -125,7 +125,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
 		}
 		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
-		if size < cb.remaining {
+		if size > cb.remaining {
 			return syserror.EINVAL
 		}
 		buf = cb.t.CopyScratchBuffer(size)
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index 97eaedd66..fdfe49243 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -16,6 +16,7 @@ package vfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -153,7 +154,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
-
+	// Make sure that only one sync type option is set.
+	syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE)
+	if syncType != 0 && !bits.IsPowerOfTwo32(syncType) {
+		return 0, nil, syserror.EINVAL
+	}
 	if mask&linux.STATX__RESERVED != 0 {
 		return 0, nil, syserror.EINVAL
 	}
@@ -272,6 +277,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err
 	if err != nil {
 		return err
 	}
+	defer tpop.Release()
 
 	// access(2) and faccessat(2) check permissions using real
 	// UID/GID, not effective UID/GID.
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 925996517..a62e43589 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -83,7 +83,7 @@ func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds
 	if !rp.Done() {
 		return syserror.ENOTDIR
 	}
-	return GenericCheckPermissions(creds, ats, false /* isDir */, anonFileMode, anonFileUID, anonFileGID)
+	return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID)
 }
 
 // GetDentryAt implements FilesystemImpl.GetDentryAt.
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 2c8f23f55..f9647f90e 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -29,9 +29,9 @@ type AccessTypes uint16
 
 // Bits in AccessTypes.
 const (
+	MayExec  AccessTypes = 1
+	MayWrite AccessTypes = 2
 	MayRead  AccessTypes = 4
-	MayWrite             = 2
-	MayExec              = 1
 )
 
 // OnlyRead returns true if access _only_ allows read.
@@ -56,16 +56,17 @@ func (a AccessTypes) MayExec() bool {
 
 // GenericCheckPermissions checks that creds has the given access rights on a
 // file with the given permissions, UID, and GID, subject to the rules of
-// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+// fs/namei.c:generic_permission().
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
 	// Check permission bits.
-	perms := mode
+	perms := uint16(mode.Permissions())
 	if creds.EffectiveKUID == kuid {
 		perms >>= 6
 	} else if creds.InGroup(kgid) {
 		perms >>= 3
 	}
 	if uint16(ats)&perms == uint16(ats) {
+		// All permission bits match, access granted.
 		return nil
 	}
 
@@ -77,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	}
 	// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
 	// directories, and read arbitrary non-directory files.
-	if (isDir && !ats.MayWrite()) || ats.OnlyRead() {
+	if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() {
 		if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
 			return nil
 		}
@@ -85,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
 	// access to non-directory files, and execute access to non-directory files
 	// for which at least one execute bit is set.
-	if isDir || !ats.MayExec() || (mode&0111 != 0) {
+	if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) {
 		if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
 			return nil
 		}
@@ -151,7 +152,7 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 // CheckSetStat checks that creds has permission to change the metadata of a
 // file with the given permissions, UID, and GID as specified by stat, subject
 // to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
 	if stat.Mask&linux.STATX_SIZE != 0 {
 		limit, err := CheckLimit(ctx, 0, int64(stat.Size))
 		if err != nil {
@@ -190,11 +191,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
 				(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
 				return syserror.EPERM
 			}
-			// isDir is irrelevant in the following call to
-			// GenericCheckPermissions since ats == MayWrite means that
-			// CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
-			// applies, regardless of isDir.
-			if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+			if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
 				return err
 			}
 		}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index eb4ebb511..8f31495da 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -329,10 +329,22 @@ func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
 // component in pcs represents a symbolic link, the symbolic link should be
 // followed.
 //
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+//   - For most non-creating walks, the last path component is handled by
+//     fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+//     after the path component is non-NULL (which is only possible if it's '/')
+//     and the path component is of type LAST_NORM.
+//
+//   - For open/openat/openat2 without O_CREAT, the last path component is
+//     handled by fs/namei.c:do_last(), which does the same, though without the
+//     LAST_NORM check.
+//
 // Preconditions: !rp.Done().
 func (rp *ResolvingPath) ShouldFollowSymlink() bool {
-	// Non-final symlinks are always followed.
-	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
+	// Non-final symlinks are always followed. Paths terminated with '/' are also
+	// always followed.
+	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
 }
 
 // HandleSymlink is called when the current path component is a symbolic link
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 2e2880171..03d1fb943 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -402,8 +402,6 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 		if err == nil {
 			vfs.putResolvingPath(rp)
 
-			// TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
-			// to FileDescription.Stat().
 			if opts.FileExec {
 				if fd.Mount().flags.NoExec {
 					fd.DecRef()
diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go
index d2d7132fa..0d4316254 100644
--- a/pkg/sync/aliases.go
+++ b/pkg/sync/aliases.go
@@ -29,3 +29,8 @@ type (
 	// Map is an alias of sync.Map.
 	Map = sync.Map
 )
+
+// NewCond is a wrapper around sync.NewCond.
+func NewCond(l Locker) *Cond {
+	return sync.NewCond(l)
+}
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 26f7ba86b..454e07662 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -5,8 +5,6 @@ package(licenses = ["notice"])
 go_library(
     name = "tcpip",
     srcs = [
-        "packet_buffer.go",
-        "packet_buffer_state.go",
         "tcpip.go",
         "time_unsafe.go",
         "timer.go",
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
deleted file mode 100644
index d1b73cfdf..000000000
--- a/pkg/tcpip/iptables/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "iptables",
-    srcs = [
-        "iptables.go",
-        "targets.go",
-        "types.go",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//pkg/log",
-        "//pkg/tcpip",
-        "//pkg/tcpip/header",
-    ],
-)
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 5944ba190..a8d6653ce 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -28,7 +28,7 @@ import (
 
 // PacketInfo holds all the information about an outbound packet.
 type PacketInfo struct {
-	Pkt   tcpip.PacketBuffer
+	Pkt   stack.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
 	Route stack.Route
@@ -203,12 +203,12 @@ func (e *Endpoint) NumQueued() int {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt stack.PacketBuffer) {
 	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
 }
 
@@ -251,7 +251,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
@@ -269,7 +269,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
@@ -280,7 +280,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 		off := pkt.DataOffset
 		size := pkt.DataSize
 		p := PacketInfo{
-			Pkt: tcpip.PacketBuffer{
+			Pkt: stack.PacketBuffer{
 				Header: pkt.Header,
 				Data:   buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(),
 			},
@@ -301,7 +301,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := PacketInfo{
-		Pkt:   tcpip.PacketBuffer{Data: vv},
+		Pkt:   stack.PacketBuffer{Data: vv},
 		Proto: 0,
 		GSO:   nil,
 	}
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 3b36b9673..3b3b6909b 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -386,7 +386,7 @@ const (
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	if e.hdrSize > 0 {
 		// Add ethernet header if needed.
 		eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
@@ -405,6 +405,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		eth.Encode(ethHdr)
 	}
 
+	fd := e.fds[pkt.Hash%uint32(len(e.fds))]
 	if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		vnetHdr := virtioNetHdr{}
 		if gso != nil {
@@ -428,19 +429,19 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		}
 
 		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
-		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
+		return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
 	}
 
 	if pkt.Data.Size() == 0 {
-		return rawfile.NonBlockingWrite(e.fds[0], pkt.Header.View())
+		return rawfile.NonBlockingWrite(fd, pkt.Header.View())
 	}
 
-	return rawfile.NonBlockingWrite3(e.fds[0], pkt.Header.View(), pkt.Data.ToView(), nil)
+	return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil)
 }
 
 // WritePackets writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	var ethHdrBuf []byte
 	// hdr + data
 	iovLen := 2
@@ -551,7 +552,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 	packets := 0
 	for packets < n {
-		sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs)
+		fd := e.fds[pkts[packets].Hash%uint32(len(e.fds))]
+		sent, err := rawfile.NonBlockingSendMMsg(fd, mmsgHdrs)
 		if err != nil {
 			return packets, err
 		}
@@ -610,7 +612,7 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 2066987eb..3bfb15a8e 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -45,40 +45,46 @@ const (
 type packetInfo struct {
 	raddr    tcpip.LinkAddress
 	proto    tcpip.NetworkProtocolNumber
-	contents tcpip.PacketBuffer
+	contents stack.PacketBuffer
 }
 
 type context struct {
-	t    *testing.T
-	fds  [2]int
-	ep   stack.LinkEndpoint
-	ch   chan packetInfo
-	done chan struct{}
+	t        *testing.T
+	readFDs  []int
+	writeFDs []int
+	ep       stack.LinkEndpoint
+	ch       chan packetInfo
+	done     chan struct{}
 }
 
 func newContext(t *testing.T, opt *Options) *context {
-	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	firstFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	if err != nil {
+		t.Fatalf("Socketpair failed: %v", err)
+	}
+	secondFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
 	if err != nil {
 		t.Fatalf("Socketpair failed: %v", err)
 	}
 
-	done := make(chan struct{}, 1)
+	done := make(chan struct{}, 2)
 	opt.ClosedFunc = func(*tcpip.Error) {
 		done <- struct{}{}
 	}
 
-	opt.FDs = []int{fds[1]}
+	opt.FDs = []int{firstFDPair[1], secondFDPair[1]}
 	ep, err := New(opt)
 	if err != nil {
 		t.Fatalf("Failed to create FD endpoint: %v", err)
 	}
 
 	c := &context{
-		t:    t,
-		fds:  fds,
-		ep:   ep,
-		ch:   make(chan packetInfo, 100),
-		done: done,
+		t:        t,
+		readFDs:  []int{firstFDPair[0], secondFDPair[0]},
+		writeFDs: opt.FDs,
+		ep:       ep,
+		ch:       make(chan packetInfo, 100),
+		done:     done,
 	}
 
 	ep.Attach(c)
@@ -87,12 +93,17 @@ func newContext(t *testing.T, opt *Options) *context {
 }
 
 func (c *context) cleanup() {
-	syscall.Close(c.fds[0])
+	for _, fd := range c.readFDs {
+		syscall.Close(fd)
+	}
+	<-c.done
 	<-c.done
-	syscall.Close(c.fds[1])
+	for _, fd := range c.writeFDs {
+		syscall.Close(fd)
+	}
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	c.ch <- packetInfo{remote, protocol, pkt}
 }
 
@@ -136,7 +147,7 @@ func TestAddress(t *testing.T) {
 	}
 }
 
-func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
+func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash uint32) {
 	c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize})
 	defer c.cleanup()
 
@@ -168,16 +179,18 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
 			L3HdrLen:   header.IPv4MaximumHeaderSize,
 		}
 	}
-	if err := c.ep.WritePacket(r, gso, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(r, gso, proto, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
+		Hash:   hash,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
-	// Read from fd, then compare with what we wrote.
+	// Read from the corresponding FD, then compare with what we wrote.
 	b = make([]byte, mtu)
-	n, err := syscall.Read(c.fds[0], b)
+	fd := c.readFDs[hash%uint32(len(c.readFDs))]
+	n, err := syscall.Read(fd, b)
 	if err != nil {
 		t.Fatalf("Read failed: %v", err)
 	}
@@ -238,7 +251,7 @@ func TestWritePacket(t *testing.T) {
 				t.Run(
 					fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v", eth, plen, gso),
 					func(t *testing.T) {
-						testWritePacket(t, plen, eth, gso)
+						testWritePacket(t, plen, eth, gso, 0)
 					},
 				)
 			}
@@ -246,6 +259,27 @@ func TestWritePacket(t *testing.T) {
 	}
 }
 
+func TestHashedWritePacket(t *testing.T) {
+	lengths := []int{0, 100, 1000}
+	eths := []bool{true, false}
+	gsos := []uint32{0, 32768}
+	hashes := []uint32{0, 1}
+	for _, eth := range eths {
+		for _, plen := range lengths {
+			for _, gso := range gsos {
+				for _, hash := range hashes {
+					t.Run(
+						fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v,Hash=%d", eth, plen, gso, hash),
+						func(t *testing.T) {
+							testWritePacket(t, plen, eth, gso, hash)
+						},
+					)
+				}
+			}
+		}
+	}
+}
+
 func TestPreserveSrcAddress(t *testing.T) {
 	baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99")
 
@@ -261,7 +295,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 	// WritePacket panics given a prependable with anything less than
 	// the minimum size of the ethernet header.
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
-	if err := c.ep.WritePacket(r, nil /* gso */, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(r, nil /* gso */, proto, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.VectorisedView{},
 	}); err != nil {
@@ -270,7 +304,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 
 	// Read from the FD, then compare with what we wrote.
 	b := make([]byte, mtu)
-	n, err := syscall.Read(c.fds[0], b)
+	n, err := syscall.Read(c.readFDs[0], b)
 	if err != nil {
 		t.Fatalf("Read failed: %v", err)
 	}
@@ -314,7 +348,7 @@ func TestDeliverPacket(t *testing.T) {
 				}
 
 				// Write packet via the file descriptor.
-				if _, err := syscall.Write(c.fds[0], all); err != nil {
+				if _, err := syscall.Write(c.readFDs[0], all); err != nil {
 					t.Fatalf("Write failed: %v", err)
 				}
 
@@ -324,7 +358,7 @@ func TestDeliverPacket(t *testing.T) {
 					want := packetInfo{
 						raddr: raddr,
 						proto: proto,
-						contents: tcpip.PacketBuffer{
+						contents: stack.PacketBuffer{
 							Data:       buffer.View(b).ToVectorisedView(),
 							LinkHeader: buffer.View(hdr),
 						},
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 62ed1e569..fe2bf3b0b 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
@@ -190,7 +191,7 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, tcpip.PacketBuffer{
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, stack.PacketBuffer{
 		Data:       buffer.View(pkt).ToVectorisedView(),
 		LinkHeader: buffer.View(eth),
 	})
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index c67d684ce..cb4cbea69 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,7 +139,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	pkt := tcpip.PacketBuffer{
+	pkt := stack.PacketBuffer{
 		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
 		LinkHeader: buffer.View(eth),
 	}
@@ -296,7 +296,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		pkt := tcpip.PacketBuffer{
+		pkt := stack.PacketBuffer{
 			Data:       buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
 			LinkHeader: buffer.View(eth),
 		}
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 499cc608f..4039753b7 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -76,7 +76,7 @@ func (*endpoint) Wait() {}
 
 // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
 // packets to the network-layer dispatcher.
-func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 	views[0] = pkt.Header.View()
 	views = append(views, pkt.Data.Views()...)
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 	// Because we're immediately turning around and writing the packet back
 	// to the rx path, we intentionally don't preserve the remote and local
 	// link addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, tcpip.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, stack.PacketBuffer{
 		Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 	})
 
@@ -92,7 +92,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -106,7 +106,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	// There should be an ethernet header at the beginning of vv.
 	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
 	vv.TrimFront(len(linkHeader))
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), tcpip.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
 		Data:       vv,
 		LinkHeader: buffer.View(linkHeader),
 	})
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 445b22c17..f5973066d 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -80,14 +80,14 @@ func (m *InjectableEndpoint) IsAttached() bool {
 }
 
 // InjectInbound implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // WritePackets writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
 // r.RemoteAddress has a route registered in this endpoint.
-func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	endpoint, ok := m.routes[r.RemoteAddress]
 	if !ok {
 		return 0, tcpip.ErrNoRoute
@@ -98,7 +98,7 @@ func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts [
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
 // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
 // route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
 		return endpoint.WritePacket(r, gso, protocol, pkt)
 	}
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 63b249837..87c734c1f 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -50,7 +50,7 @@ func TestInjectableEndpointDispatch(t *testing.T) {
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
 
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(),
 	})
@@ -70,7 +70,7 @@ func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
 	hdr := buffer.NewPrependable(1)
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewView(0).ToVectorisedView(),
 	})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 655e537c4..6461d0108 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -185,7 +185,7 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress {
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	// Add the ethernet header here.
 	eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
 	pkt.LinkHeader = buffer.View(eth)
@@ -214,7 +214,7 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -275,7 +275,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 
 		// Send packet up the stack.
 		eth := header.Ethernet(b[:header.EthernetMinimumSize])
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), tcpip.PacketBuffer{
+		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), stack.PacketBuffer{
 			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
 			LinkHeader: buffer.View(eth),
 		})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 5c729a439..27ea3f531 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,7 +131,7 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
 		addr:  remoteLinkAddr,
@@ -273,7 +273,7 @@ func TestSimpleSend(t *testing.T) {
 			randomFill(buf)
 
 			proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-			if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+			if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{
 				Header: hdr,
 				Data:   buf.ToVectorisedView(),
 			}); err != nil {
@@ -345,7 +345,7 @@ func TestPreserveSrcAddressInSend(t *testing.T) {
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
 
 	proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-	if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{
 		Header: hdr,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
@@ -401,7 +401,7 @@ func TestFillTxQueue(t *testing.T) {
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -419,7 +419,7 @@ func TestFillTxQueue(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -447,7 +447,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	// Send two packets so that the id slice has at least two slots.
 	for i := 2; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -470,7 +470,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -488,7 +488,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -514,7 +514,7 @@ func TestFillTxMemory(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queueDataSize / bufferSize; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -533,7 +533,7 @@ func TestFillTxMemory(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	})
@@ -561,7 +561,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// until there is only one buffer left.
 	for i := queueDataSize/bufferSize - 1; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -577,7 +577,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 		uu := buffer.NewView(bufferSize).ToVectorisedView()
-		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   uu,
 		}); err != want {
@@ -588,7 +588,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// Attempt to write the one-buffer packet again. It must succeed.
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 3392b7edd..0a6b8945c 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -123,7 +123,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("recv", protocol, pkt.Data.First(), nil)
 	}
@@ -200,7 +200,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
 		logPacket("send", protocol, pkt.Header.View(), gso)
 	}
@@ -232,7 +232,7 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumb
 // WritePacket implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	e.dumpPacket(gso, protocol, pkt)
 	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
@@ -240,10 +240,10 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	view := pkts[0].Data.ToView()
 	for _, pkt := range pkts {
-		e.dumpPacket(gso, protocol, tcpip.PacketBuffer{
+		e.dumpPacket(gso, protocol, stack.PacketBuffer{
 			Header: pkt.Header,
 			Data:   view[pkt.DataOffset:][:pkt.DataSize].ToVectorisedView(),
 		})
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index f6e301304..617446ea2 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -213,7 +213,7 @@ func (d *Device) Write(data []byte) (int64, error) {
 		remote = tcpip.LinkAddress(zeroMAC[:])
 	}
 
-	pkt := tcpip.PacketBuffer{
+	pkt := stack.PacketBuffer{
 		Data: buffer.View(data).ToVectorisedView(),
 	}
 	if ethHdr != nil {
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index a8de38979..52fe397bf 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,7 +50,7 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
@@ -99,7 +99,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 // WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	if !e.writeGate.Enter() {
 		return nil
 	}
@@ -112,7 +112,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	if !e.writeGate.Enter() {
 		return len(pkts), nil
 	}
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 31b11a27a..88224e494 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	e.dispatchCount++
 }
 
@@ -65,13 +65,13 @@ func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	e.writeCount++
 	return nil
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	e.writeCount += len(pkts)
 	return len(pkts), nil
 }
@@ -89,21 +89,21 @@ func TestWaitWrite(t *testing.T) {
 	wep := New(ep)
 
 	// Write and check that it goes through.
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
 	if want := 1; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on dispatches, then try to write. It must go through.
 	wep.WaitDispatch()
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on writes, then try to write. It must not go through.
 	wep.WaitWrite()
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index e9fcc89a8..255098372 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -79,20 +79,20 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 
 func (e *endpoint) Close() {}
 
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	v := pkt.Data.First()
 	h := header.ARP(v)
 	if !h.IsValid() {
@@ -113,7 +113,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 		copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget())
 		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
 		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
-		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
 			Header: hdr,
 		})
 		fallthrough // also fill the cache from requests
@@ -167,7 +167,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	copy(h.ProtocolAddressSender(), localAddr)
 	copy(h.ProtocolAddressTarget(), addr)
 
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 	})
 }
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 03cf03b6d..b3e239ac7 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -103,7 +103,7 @@ func TestDirectRequest(t *testing.T) {
 
 	inject := func(addr tcpip.Address) {
 		copy(h.ProtocolAddressTarget(), addr)
-		c.linkEP.InjectInbound(arp.ProtocolNumber, tcpip.PacketBuffer{
+		c.linkEP.InjectInbound(arp.ProtocolNumber, stack.PacketBuffer{
 			Data: v.ToVectorisedView(),
 		})
 	}
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index f4d78f8c6..4950d69fc 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -96,7 +96,7 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt stack.PacketBuffer) {
 	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
 }
@@ -104,7 +104,7 @@ func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.Trans
 // DeliverTransportControlPacket is called by network endpoints after parsing
 // incoming control (ICMP) packets. This is used by the test object to verify
 // that the results of the parsing are expected.
-func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	t.checkValues(trans, pkt.Data, remote, local)
 	if typ != t.typ {
 		t.t.Errorf("typ = %v, want %v", typ, t.typ)
@@ -150,7 +150,7 @@ func (*testObject) Wait() {}
 // WritePacket is called by network endpoints after producing a packet and
 // writing it to the link endpoint. This is used by the test object to verify
 // that the produced packet is as expected.
-func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	var prot tcpip.TransportProtocolNumber
 	var srcAddr tcpip.Address
 	var dstAddr tcpip.Address
@@ -172,7 +172,7 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []stack.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -246,7 +246,7 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -289,7 +289,7 @@ func TestIPv4Receive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: view.ToVectorisedView(),
 	})
 	if o.dataCalls != 1 {
@@ -379,7 +379,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 			o.extra = c.expectedExtra
 
 			vv := view[:len(view)-c.trunc].ToVectorisedView()
-			ep.HandlePacket(&r, tcpip.PacketBuffer{
+			ep.HandlePacket(&r, stack.PacketBuffer{
 				Data: vv,
 			})
 			if want := c.expectedCount; o.controlCalls != want {
@@ -444,7 +444,7 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send first segment.
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: frag1.ToVectorisedView(),
 	})
 	if o.dataCalls != 0 {
@@ -452,7 +452,7 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send second segment.
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: frag2.ToVectorisedView(),
 	})
 	if o.dataCalls != 1 {
@@ -487,7 +487,7 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -530,7 +530,7 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("could not find route: %v", err)
 	}
 
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
+	ep.HandlePacket(&r, stack.PacketBuffer{
 		Data: view.ToVectorisedView(),
 	})
 	if o.dataCalls != 1 {
@@ -644,7 +644,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			// Set ICMPv6 checksum.
 			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
 
-			ep.HandlePacket(&r, tcpip.PacketBuffer{
+			ep.HandlePacket(&r, stack.PacketBuffer{
 				Data: view[:len(view)-c.trunc].ToVectorisedView(),
 			})
 			if want := c.expectedCount; o.controlCalls != want {
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 0fef2b1f1..880ea7de2 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -13,7 +13,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 32bf39e43..c4bf1ba5c 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -15,7 +15,6 @@
 package ipv4
 
 import (
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -25,7 +24,7 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	h := header.IPv4(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that the IP
@@ -53,7 +52,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
 	v := pkt.Data.First()
@@ -85,7 +84,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 
 		// It's possible that a raw socket expects to receive this.
 		h.SetChecksum(wantChecksum)
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, tcpip.PacketBuffer{
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, stack.PacketBuffer{
 			Data:          pkt.Data.Clone(nil),
 			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
 		})
@@ -99,7 +98,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 		pkt.SetChecksum(0)
 		pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0)))
 		sent := stats.ICMP.V4PacketsSent
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header:          hdr,
 			Data:            vv,
 			TransportHeader: buffer.View(pkt),
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 4f1742938..b3ee6000e 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
 	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -125,7 +124,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 // packet's stated length matches the length of the header+payload. mtu
 // includes the IP header and options. This does not support the DontFragment
 // IP flag.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt stack.PacketBuffer) *tcpip.Error {
 	// This packet is too big, it needs to be fragmented.
 	ip := header.IPv4(pkt.Header.View())
 	flags := ip.Flags()
@@ -165,7 +164,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 		if i > 0 {
 			newPayload := pkt.Data.Clone(nil)
 			newPayload.CapLength(innerMTU)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -184,7 +183,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			newPayload := pkt.Data.Clone(nil)
 			newPayloadLength := outerMTU - pkt.Header.UsedLength()
 			newPayload.CapLength(newPayloadLength)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -198,7 +197,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			startOfHdr := pkt.Header
 			startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU)
 			emptyVV := buffer.NewVectorisedView(0, []buffer.View{})
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
 				Header:        startOfHdr,
 				Data:          emptyVV,
 				NetworkHeader: buffer.View(h),
@@ -241,7 +240,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -253,7 +252,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
-		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+		e.HandlePacket(&loopedR, stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
@@ -273,7 +272,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
@@ -292,7 +291,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
 	ip := header.IPv4(pkt.Data.First())
@@ -344,7 +343,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuf
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	headerView := pkt.Data.First()
 	h := header.IPv4(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
@@ -361,7 +360,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
-	if ok := ipt.Check(iptables.Input, pkt); !ok {
+	if ok := ipt.Check(stack.Input, pkt); !ok {
 		// iptables is telling us to drop the packet.
 		return
 	}
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index e900f1b45..5a864d832 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -113,7 +113,7 @@ func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.
 
 // comparePayloads compared the contents of all the packets against the contents
 // of the source packet.
-func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketInfo tcpip.PacketBuffer, mtu uint32) {
+func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketInfo stack.PacketBuffer, mtu uint32) {
 	t.Helper()
 	// Make a complete array of the sourcePacketInfo packet.
 	source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize])
@@ -173,7 +173,7 @@ func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketIn
 
 type errorChannel struct {
 	*channel.Endpoint
-	Ch                    chan tcpip.PacketBuffer
+	Ch                    chan stack.PacketBuffer
 	packetCollectorErrors []*tcpip.Error
 }
 
@@ -183,7 +183,7 @@ type errorChannel struct {
 func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
 	return &errorChannel{
 		Endpoint:              channel.New(size, mtu, linkAddr),
-		Ch:                    make(chan tcpip.PacketBuffer, size),
+		Ch:                    make(chan stack.PacketBuffer, size),
 		packetCollectorErrors: packetCollectorErrors,
 	}
 }
@@ -202,7 +202,7 @@ func (e *errorChannel) Drain() int {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
 	select {
 	case e.Ch <- pkt:
 	default:
@@ -281,13 +281,13 @@ func TestFragmentation(t *testing.T) {
 	for _, ft := range fragTests {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes)
-			source := tcpip.PacketBuffer{
+			source := stack.PacketBuffer{
 				Header: hdr,
 				// Save the source payload because WritePacket will modify it.
 				Data: payload.Clone(nil),
 			}
 			c := buildContext(t, nil, ft.mtu)
-			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -295,7 +295,7 @@ func TestFragmentation(t *testing.T) {
 				t.Errorf("err got %v, want %v", err, nil)
 			}
 
-			var results []tcpip.PacketBuffer
+			var results []stack.PacketBuffer
 		L:
 			for {
 				select {
@@ -337,7 +337,7 @@ func TestFragmentationErrors(t *testing.T) {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes)
 			c := buildContext(t, ft.packetCollectorErrors, ft.mtu)
-			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -459,7 +459,7 @@ func TestInvalidFragments(t *testing.T) {
 			s.CreateNIC(nicID, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, tcpip.PacketBuffer{
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 45dc757c7..8640feffc 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -27,7 +27,7 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	h := header.IPv6(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that up to
@@ -62,7 +62,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
@@ -243,7 +243,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		//
 		// The IP Hop Limit field has a value of 255, i.e., the packet
 		// could not possibly have been forwarded by a router.
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 		}); err != nil {
 			sent.Dropped.Increment()
@@ -330,7 +330,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		copy(packet, h)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   pkt.Data,
 		}); err != nil {
@@ -463,7 +463,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 
 	// TODO(stijlist): count this in ICMP stats.
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
 		Header: hdr,
 	})
 }
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 50c4b6474..bae09ed94 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -56,7 +56,7 @@ func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return ""
 }
 
-func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, tcpip.PacketBuffer) *tcpip.Error {
+func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, stack.PacketBuffer) *tcpip.Error {
 	return nil
 }
 
@@ -66,7 +66,7 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, tcpip.PacketBuffer) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, stack.PacketBuffer) {
 }
 
 type stubLinkAddressCache struct {
@@ -187,7 +187,7 @@ func TestICMPCounts(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(&r, tcpip.PacketBuffer{
+		ep.HandlePacket(&r, stack.PacketBuffer{
 			Data: hdr.View().ToVectorisedView(),
 		})
 	}
@@ -326,7 +326,7 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
 		size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size()
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
+		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), stack.PacketBuffer{
 			Data: vv,
 		})
 	}
@@ -561,7 +561,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 					Data: hdr.View().ToVectorisedView(),
 				})
 			}
@@ -738,7 +738,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 					Data: hdr.View().ToVectorisedView(),
 				})
 			}
@@ -916,7 +916,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 9aef5234b..29e597002 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -112,7 +112,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -124,7 +124,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
-		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+		e.HandlePacket(&loopedR, stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
@@ -139,7 +139,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
 	}
@@ -161,14 +161,14 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	// TODO(b/146666412): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	headerView := pkt.Data.First()
 	h := header.IPv6(headerView)
 	if !h.IsValid(pkt.Data.Size()) {
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 1cbfa7278..ed98ef22a 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -55,7 +55,7 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
@@ -113,7 +113,7 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index c9395de52..f924ed9e1 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -135,7 +135,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -238,7 +238,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -304,7 +304,7 @@ func TestHopLimitValidation(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(r, tcpip.PacketBuffer{
+		ep.HandlePacket(r, stack.PacketBuffer{
 			Data: hdr.View().ToVectorisedView(),
 		})
 	}
@@ -588,7 +588,7 @@ func TestRouterAdvertValidation(t *testing.T) {
 				t.Fatalf("got rxRA = %d, want = 0", got)
 			}
 
-			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 6c029b2fb..8d80e9cee 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -21,10 +21,16 @@ go_library(
         "dhcpv6configurationfromndpra_string.go",
         "forwarder.go",
         "icmp_rate_limit.go",
+        "iptables.go",
+        "iptables_targets.go",
+        "iptables_types.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
         "ndp.go",
         "nic.go",
+        "packet_buffer.go",
+        "packet_buffer_state.go",
+        "rand.go",
         "registration.go",
         "route.go",
         "stack.go",
@@ -34,6 +40,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/ilist",
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
@@ -41,7 +48,6 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/waiter",
@@ -65,7 +71,6 @@ go_test(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go
index 631953935..6b64cd37f 100644
--- a/pkg/tcpip/stack/forwarder.go
+++ b/pkg/tcpip/stack/forwarder.go
@@ -32,7 +32,7 @@ type pendingPacket struct {
 	nic   *NIC
 	route *Route
 	proto tcpip.NetworkProtocolNumber
-	pkt   tcpip.PacketBuffer
+	pkt   PacketBuffer
 }
 
 type forwardQueue struct {
@@ -50,7 +50,7 @@ func newForwardQueue() *forwardQueue {
 	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
 }
 
-func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	shouldWait := false
 
 	f.Lock()
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 321b7524d..c45c43d21 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -68,7 +68,7 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt tcpip.PacketBuffer) {
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) {
 	// Consume the network header.
 	b := pkt.Data.First()
 	pkt.Data.TrimFront(fwdTestNetHeaderLen)
@@ -89,7 +89,7 @@ func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error {
 	// Add the protocol's header to the packet and send it to the link
 	// endpoint.
 	b := pkt.Header.Prepend(fwdTestNetHeaderLen)
@@ -101,11 +101,11 @@ func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkH
 }
 
 // WritePackets implements LinkEndpoint.WritePackets.
-func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
-func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -183,7 +183,7 @@ func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumb
 type fwdTestPacketInfo struct {
 	RemoteLinkAddress tcpip.LinkAddress
 	LocalLinkAddress  tcpip.LinkAddress
-	Pkt               tcpip.PacketBuffer
+	Pkt               PacketBuffer
 }
 
 type fwdTestLinkEndpoint struct {
@@ -196,12 +196,12 @@ type fwdTestLinkEndpoint struct {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt PacketBuffer) {
 	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
 }
 
@@ -244,7 +244,7 @@ func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error {
 	p := fwdTestPacketInfo{
 		RemoteLinkAddress: r.RemoteLinkAddress,
 		LocalLinkAddress:  r.LocalLinkAddress,
@@ -260,7 +260,7 @@ func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.Netw
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	n := 0
 	for _, pkt := range pkts {
 		e.WritePacket(r, gso, protocol, pkt)
@@ -273,7 +273,7 @@ func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts []tcpip.Pack
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := fwdTestPacketInfo{
-		Pkt: tcpip.PacketBuffer{Data: vv},
+		Pkt: PacketBuffer{Data: vv},
 	}
 
 	select {
@@ -355,7 +355,7 @@ func TestForwardingWithStaticResolver(t *testing.T) {
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -392,7 +392,7 @@ func TestForwardingWithFakeResolver(t *testing.T) {
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -423,7 +423,7 @@ func TestForwardingWithNoResolver(t *testing.T) {
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -453,7 +453,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 	// not be forwarded.
 	buf := buffer.NewView(30)
 	buf[0] = 4
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -461,7 +461,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 	// forwarded to NIC 2.
 	buf = buffer.NewView(30)
 	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -503,7 +503,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 	for i := 0; i < 2; i++ {
 		buf := buffer.NewView(30)
 		buf[0] = 3
-		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -550,7 +550,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 		buf[0] = 3
 		// Set the packet sequence number.
 		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
-		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -603,7 +603,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 		// maxPendingResolutions + 7).
 		buf := buffer.NewView(30)
 		buf[0] = byte(3 + i)
-		ep1.InjectInbound(fwdTestNetNumber, tcpip.PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/stack/iptables.go
index d30571c74..37907ae24 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -12,14 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package iptables supports packet filtering and manipulation via the iptables
-// tool.
-package iptables
+package stack
 
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
@@ -176,7 +173,7 @@ const (
 // dropped.
 //
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
+func (it *IPTables) Check(hook Hook, pkt PacketBuffer) bool {
 	// Go through each table containing the hook.
 	for _, tablename := range it.Priorities[hook] {
 		table := it.Tables[tablename]
@@ -213,7 +210,7 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) chainVerdict {
+func (it *IPTables) checkChain(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
 	for ruleIdx < len(table.Rules) {
@@ -258,7 +255,7 @@ func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, r
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
+func (it *IPTables) checkRule(hook Hook, pkt PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/stack/iptables_targets.go
index e457f2349..7b4543caf 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package iptables
+package stack
 
 import (
 	"gvisor.dev/gvisor/pkg/log"
@@ -24,7 +24,7 @@ import (
 type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+func (AcceptTarget) Action(packet PacketBuffer) (RuleVerdict, int) {
 	return RuleAccept, 0
 }
 
@@ -32,7 +32,7 @@ func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 type DropTarget struct{}
 
 // Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+func (DropTarget) Action(packet PacketBuffer) (RuleVerdict, int) {
 	return RuleDrop, 0
 }
 
@@ -41,7 +41,7 @@ func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+func (ErrorTarget) Action(packet PacketBuffer) (RuleVerdict, int) {
 	log.Debugf("ErrorTarget triggered.")
 	return RuleDrop, 0
 }
@@ -52,7 +52,7 @@ type UserChainTarget struct {
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
+func (UserChainTarget) Action(PacketBuffer) (RuleVerdict, int) {
 	panic("UserChainTarget should never be called.")
 }
 
@@ -61,7 +61,7 @@ func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 type ReturnTarget struct{}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
+func (ReturnTarget) Action(PacketBuffer) (RuleVerdict, int) {
 	return RuleReturn, 0
 }
 
@@ -92,7 +92,7 @@ type RedirectTarget struct {
 // TODO(gvisor.dev/issue/170): Parse headers without copying. The current
 // implementation only works for PREROUTING and calls pkt.Clone(), neither
 // of which should be the case.
-func (rt RedirectTarget) Action(pkt tcpip.PacketBuffer) (RuleVerdict, int) {
+func (rt RedirectTarget) Action(pkt PacketBuffer) (RuleVerdict, int) {
 	newPkt := pkt.Clone()
 
 	// Set network header.
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/stack/iptables_types.go
index e7fcf6bff..2ffb55f2a 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package iptables
+package stack
 
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -168,7 +168,7 @@ type Matcher interface {
 	// used for suspicious packets.
 	//
 	// Precondition: packet.NetworkHeader is set.
-	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+	Match(hook Hook, packet PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
 }
 
 // A Target is the interface for taking an action for a packet.
@@ -176,5 +176,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the index of the rule to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
+	Action(packet PacketBuffer) (RuleVerdict, int)
 }
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index d689a006d..630fdefc5 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -564,7 +564,7 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 			Protocol: header.ICMPv6ProtocolNumber,
 			TTL:      header.NDPHopLimit,
 			TOS:      DefaultTOS,
-		}, tcpip.PacketBuffer{Header: hdr},
+		}, PacketBuffer{Header: hdr},
 	); err != nil {
 		sent.Dropped.Increment()
 		return err
@@ -1283,7 +1283,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 				Protocol: header.ICMPv6ProtocolNumber,
 				TTL:      header.NDPHopLimit,
 				TOS:      DefaultTOS,
-			}, tcpip.PacketBuffer{Header: hdr},
+			}, PacketBuffer{Header: hdr},
 		); err != nil {
 			sent.Dropped.Increment()
 			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 4368c236c..06edd05b6 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -602,7 +602,7 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate multiple nodes owning or
 			// attempting to own the same address.
 			hdr := test.makeBuf(addr1)
-			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -918,7 +918,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 
 // raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
 // and DHCPv6 configurations specified.
-func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) stack.PacketBuffer {
 	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
@@ -953,14 +953,14 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo
 		DstAddr:       header.IPv6AllNodesMulticastAddress,
 	})
 
-	return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
+	return stack.PacketBuffer{Data: hdr.View().ToVectorisedView()}
 }
 
 // raBufWithOpts returns a valid NDP Router Advertisement with options.
 //
 // Note, raBufWithOpts does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
 }
 
@@ -969,7 +969,7 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ
 //
 // Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
 // DHCPv6 related ones.
-func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) tcpip.PacketBuffer {
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{})
 }
 
@@ -977,7 +977,7 @@ func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bo
 //
 // Note, raBuf does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
+func raBuf(ip tcpip.Address, rl uint16) stack.PacketBuffer {
 	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{})
 }
 
@@ -986,7 +986,7 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
 //
 // Note, raBufWithPI does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) stack.PacketBuffer {
 	flags := uint8(0)
 	if onLink {
 		// The OnLink flag is the 7th bit in the flags byte.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 9dcb1d52c..b6fa647ea 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 )
 
 var ipv4BroadcastAddr = tcpip.ProtocolAddress{
@@ -1144,7 +1143,7 @@ func (n *NIC) isInGroup(addr tcpip.Address) bool {
 	return joins != 0
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
 
@@ -1158,7 +1157,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	n.mu.RLock()
 	enabled := n.mu.enabled
 	// If the NIC is not yet enabled, don't receive any packets.
@@ -1222,7 +1221,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
 	if protocol == header.IPv4ProtocolNumber {
 		ipt := n.stack.IPTables()
-		if ok := ipt.Check(iptables.Prerouting, pkt); !ok {
+		if ok := ipt.Check(Prerouting, pkt); !ok {
 			// iptables is telling us to drop the packet.
 			return
 		}
@@ -1287,7 +1286,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 }
 
-func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
 
 	firstData := pkt.Data.First()
@@ -1318,7 +1317,7 @@ func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt
 
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
@@ -1364,7 +1363,7 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 
 // DeliverTransportControlPacket delivers control packets to the appropriate
 // transport protocol endpoint.
-func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer) {
 	state, ok := n.stack.transportProtocols[trans]
 	if !ok {
 		return
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index edaee3b86..d672fc157 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -17,7 +17,6 @@ package stack
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
@@ -45,7 +44,7 @@ func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
 		t.FailNow()
 	}
 
-	nic.DeliverNetworkPacket(nil, "", "", 0, tcpip.PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+	nic.DeliverNetworkPacket(nil, "", "", 0, PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
 
 	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
 		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index ab24372e7..9505a4e92 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -11,7 +11,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package tcpip
+package stack
 
 import "gvisor.dev/gvisor/pkg/tcpip/buffer"
 
@@ -55,6 +55,10 @@ type PacketBuffer struct {
 	LinkHeader      buffer.View
 	NetworkHeader   buffer.View
 	TransportHeader buffer.View
+
+	// Hash is the transport layer hash of this packet. A value of zero
+	// indicates no valid hash has been set.
+	Hash uint32
 }
 
 // Clone makes a copy of pk. It clones the Data field, which creates a new
diff --git a/pkg/tcpip/packet_buffer_state.go b/pkg/tcpip/stack/packet_buffer_state.go
index ad3cc24fa..0c6b7924c 100644
--- a/pkg/tcpip/packet_buffer_state.go
+++ b/pkg/tcpip/stack/packet_buffer_state.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package tcpip
+package stack
 
 import "gvisor.dev/gvisor/pkg/tcpip/buffer"
 
diff --git a/pkg/tcpip/stack/rand.go b/pkg/tcpip/stack/rand.go
new file mode 100644
index 000000000..421fb5c15
--- /dev/null
+++ b/pkg/tcpip/stack/rand.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	mathrand "math/rand"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// lockedRandomSource provides a threadsafe rand.Source.
+type lockedRandomSource struct {
+	mu  sync.Mutex
+	src mathrand.Source
+}
+
+func (r *lockedRandomSource) Int63() (n int64) {
+	r.mu.Lock()
+	n = r.src.Int63()
+	r.mu.Unlock()
+	return n
+}
+
+func (r *lockedRandomSource) Seed(seed int64) {
+	r.mu.Lock()
+	r.src.Seed(seed)
+	r.mu.Unlock()
+}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index fa28b46b1..ac043b722 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -67,12 +67,12 @@ type TransportEndpoint interface {
 	// this transport endpoint. It sets pkt.TransportHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer)
 
 	// HandleControlPacket is called by the stack when new control (e.g.
 	// ICMP) packets arrive to this transport endpoint.
 	// HandleControlPacket takes ownership of pkt.
-	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer)
 
 	// Abort initiates an expedited endpoint teardown. It puts the endpoint
 	// in a closed state and frees all resources associated with it. This
@@ -100,7 +100,7 @@ type RawTransportEndpoint interface {
 	// layer up.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, pkt PacketBuffer)
 }
 
 // PacketEndpoint is the interface that needs to be implemented by packet
@@ -118,7 +118,7 @@ type PacketEndpoint interface {
 	// should construct its own ethernet header for applications.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -150,7 +150,7 @@ type TransportProtocol interface {
 	// stats purposes only).
 	//
 	// HandleUnknownDestinationPacket takes ownership of pkt.
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt PacketBuffer) bool
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -180,7 +180,7 @@ type TransportDispatcher interface {
 	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
 	// DeliverTransportPacket takes ownership of pkt.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer)
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
@@ -189,7 +189,7 @@ type TransportDispatcher interface {
 	// DeliverTransportControlPacket.
 	//
 	// DeliverTransportControlPacket takes ownership of pkt.
-	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer)
 }
 
 // PacketLooping specifies where an outbound packet should be sent.
@@ -242,15 +242,15 @@ type NetworkEndpoint interface {
 	// WritePacket writes a packet to the given destination address and
 	// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
 	// already been set.
-	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
 	// protocol. pkts must not be zero length.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
 	// header to the given destination address.
-	WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error
+	WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error
 
 	// ID returns the network protocol endpoint ID.
 	ID() *NetworkEndpointID
@@ -265,7 +265,7 @@ type NetworkEndpoint interface {
 	// this network endpoint. It sets pkt.NetworkHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, pkt PacketBuffer)
 
 	// Close is called when the endpoint is reomved from a stack.
 	Close()
@@ -322,7 +322,7 @@ type NetworkDispatcher interface {
 	// packets sent via loopback), and won't have the field set.
 	//
 	// DeliverNetworkPacket takes ownership of pkt.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -354,7 +354,7 @@ const (
 // LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
 // ethernet, loopback, raw) and used by network layer protocols to send packets
 // out through the implementer's data link endpoint. When a link header exists,
-// it sets each tcpip.PacketBuffer's LinkHeader field before passing it up the
+// it sets each PacketBuffer's LinkHeader field before passing it up the
 // stack.
 type LinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
@@ -385,7 +385,7 @@ type LinkEndpoint interface {
 	// To participate in transparent bridging, a LinkEndpoint implementation
 	// should call eth.Encode with header.EthernetFields.SrcAddr set to
 	// r.LocalLinkAddress if it is provided.
-	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets with the given protocol through the
 	// given route. pkts must not be zero length.
@@ -393,7 +393,7 @@ type LinkEndpoint interface {
 	// Right now, WritePackets is used only when the software segmentation
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts []PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header.
@@ -426,7 +426,7 @@ type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
 	// InjectInbound injects an inbound packet.
-	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
 
 	// InjectOutbound writes a fully formed outbound packet directly to the
 	// link.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index f565aafb2..9fbe8a411 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -153,7 +153,7 @@ func (r *Route) IsResolutionRequired() bool {
 }
 
 // WritePacket writes the packet through the given route.
-func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -169,7 +169,7 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.Pack
 }
 
 // WritePackets writes the set of packets through the given route.
-func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+func (r *Route) WritePackets(gso *GSO, pkts []PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
 	if !r.ref.isValidForOutgoing() {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
@@ -190,7 +190,7 @@ func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params Network
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error {
+func (r *Route) WriteHeaderIncludedPacket(pkt PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6f423874a..41398a1b6 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -20,7 +20,9 @@
 package stack
 
 import (
+	"bytes"
 	"encoding/binary"
+	mathrand "math/rand"
 	"sync/atomic"
 	"time"
 
@@ -31,7 +33,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -51,7 +52,7 @@ const (
 
 type transportProtocolState struct {
 	proto          TransportProtocol
-	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
+	defaultHandler func(r *Route, id TransportEndpointID, pkt PacketBuffer) bool
 }
 
 // TCPProbeFunc is the expected function type for a TCP probe function to be
@@ -428,7 +429,7 @@ type Stack struct {
 
 	// tables are the iptables packet filtering and manipulation rules. The are
 	// protected by tablesMu.`
-	tables iptables.IPTables
+	tables IPTables
 
 	// resumableEndpoints is a list of endpoints that need to be resumed if the
 	// stack is being restored.
@@ -466,6 +467,10 @@ type Stack struct {
 	// forwarder holds the packets that wait for their link-address resolutions
 	// to complete, and forwards them when each resolution is done.
 	forwarder *forwardQueue
+
+	// randomGenerator is an injectable pseudo random generator that can be
+	// used when a random number is required.
+	randomGenerator *mathrand.Rand
 }
 
 // UniqueID is an abstract generator of unique identifiers.
@@ -526,9 +531,16 @@ type Options struct {
 	// this is non-nil.
 	RawFactory RawFactory
 
-	// OpaqueIIDOpts hold the options for generating opaque interface identifiers
-	// (IIDs) as outlined by RFC 7217.
+	// OpaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
 	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// RandSource is an optional source to use to generate random
+	// numbers. If omitted it defaults to a Source seeded by the data
+	// returned by rand.Read().
+	//
+	// RandSource must be thread-safe.
+	RandSource mathrand.Source
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -624,6 +636,13 @@ func New(opts Options) *Stack {
 		opts.UniqueID = new(uniqueIDGenerator)
 	}
 
+	randSrc := opts.RandSource
+	if randSrc == nil {
+		// Source provided by mathrand.NewSource is not thread-safe so
+		// we wrap it in a simple thread-safe version.
+		randSrc = &lockedRandomSource{src: mathrand.NewSource(generateRandInt64())}
+	}
+
 	// Make sure opts.NDPConfigs contains valid values only.
 	opts.NDPConfigs.validate()
 
@@ -646,6 +665,7 @@ func New(opts Options) *Stack {
 		ndpDisp:              opts.NDPDisp,
 		opaqueIIDOpts:        opts.OpaqueIIDOpts,
 		forwarder:            newForwardQueue(),
+		randomGenerator:      mathrand.New(randSrc),
 	}
 
 	// Add specified network protocols.
@@ -738,7 +758,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber,
 //
 // It must be called only during initialization of the stack. Changing it as the
 // stack is operating is not supported.
-func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) {
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, PacketBuffer) bool) {
 	state := s.transportProtocols[p]
 	if state != nil {
 		state.defaultHandler = h
@@ -1701,7 +1721,7 @@ func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool,
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() iptables.IPTables {
+func (s *Stack) IPTables() IPTables {
 	s.tablesMu.RLock()
 	t := s.tables
 	s.tablesMu.RUnlock()
@@ -1709,7 +1729,7 @@ func (s *Stack) IPTables() iptables.IPTables {
 }
 
 // SetIPTables sets the stack's iptables.
-func (s *Stack) SetIPTables(ipt iptables.IPTables) {
+func (s *Stack) SetIPTables(ipt IPTables) {
 	s.tablesMu.Lock()
 	s.tables = ipt
 	s.tablesMu.Unlock()
@@ -1819,6 +1839,12 @@ func (s *Stack) Seed() uint32 {
 	return s.seed
 }
 
+// Rand returns a reference to a pseudo random generator that can be used
+// to generate random numbers as required.
+func (s *Stack) Rand() *mathrand.Rand {
+	return s.randomGenerator
+}
+
 func generateRandUint32() uint32 {
 	b := make([]byte, 4)
 	if _, err := rand.Read(b); err != nil {
@@ -1826,3 +1852,16 @@ func generateRandUint32() uint32 {
 	}
 	return binary.LittleEndian.Uint32(b)
 }
+
+func generateRandInt64() int64 {
+	b := make([]byte, 8)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	buf := bytes.NewReader(b)
+	var v int64
+	if err := binary.Read(buf, binary.LittleEndian, &v); err != nil {
+		panic(err)
+	}
+	return v
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9836b340f..555fcd92f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -90,7 +90,7 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
@@ -126,7 +126,7 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
 	// Increment the sent packet count in the protocol descriptor.
 	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
 
@@ -141,7 +141,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 		views[0] = pkt.Header.View()
 		views = append(views, pkt.Data.Views()...)
-		f.HandlePacket(r, tcpip.PacketBuffer{
+		f.HandlePacket(r, stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 	}
@@ -153,11 +153,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []stack.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -287,7 +287,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet with wrong address is not delivered.
 	buf[0] = 3
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 0 {
@@ -299,7 +299,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to first endpoint.
 	buf[0] = 1
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -311,7 +311,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to second endpoint.
 	buf[0] = 2
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -322,7 +322,7 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is not delivered if protocol number is wrong.
-	ep.InjectInbound(fakeNetNumber-1, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber-1, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -334,7 +334,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet that is too small is dropped.
 	buf.CapLength(2)
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -356,7 +356,7 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro
 
 func send(r stack.Route, payload buffer.View) *tcpip.Error {
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()))
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	})
@@ -414,7 +414,7 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b
 
 func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
 	t.Helper()
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got := fakeNet.PacketCount(localAddrByte); got != want {
@@ -2257,7 +2257,7 @@ func TestNICStats(t *testing.T) {
 
 	// Send a packet to address 1.
 	buf := buffer.NewView(30)
-	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
@@ -2339,7 +2339,7 @@ func TestNICForwarding(t *testing.T) {
 			// Send a packet to dstAddr.
 			buf := buffer.NewView(30)
 			buf[0] = dstAddr[0]
-			ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+			ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 				Data: buf.ToVectorisedView(),
 			})
 
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index d4c0359e8..9a33ed375 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -35,7 +35,7 @@ type protocolIDs struct {
 type transportEndpoints struct {
 	// mu protects all fields of the transportEndpoints.
 	mu        sync.RWMutex
-	endpoints map[TransportEndpointID]*endpointsByNic
+	endpoints map[TransportEndpointID]*endpointsByNIC
 	// rawEndpoints contains endpoints for raw sockets, which receive all
 	// traffic of a given protocol regardless of port.
 	rawEndpoints []RawTransportEndpoint
@@ -46,11 +46,11 @@ type transportEndpoints struct {
 func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
-	epsByNic, ok := eps.endpoints[id]
+	epsByNIC, ok := eps.endpoints[id]
 	if !ok {
 		return
 	}
-	if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
+	if !epsByNIC.unregisterEndpoint(bindToDevice, ep) {
 		return
 	}
 	delete(eps.endpoints, id)
@@ -66,18 +66,85 @@ func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint {
 	return es
 }
 
-type endpointsByNic struct {
+// iterEndpointsLocked yields all endpointsByNIC in eps that match id, in
+// descending order of match quality. If a call to yield returns false,
+// iterEndpointsLocked stops iteration and returns immediately.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) {
+	// Try to find a match with the id as provided.
+	if ep, ok := eps.endpoints[id]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with the id minus the local address.
+	nid := id
+
+	nid.LocalAddress = ""
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with the id minus the remote part.
+	nid.LocalAddress = id.LocalAddress
+	nid.RemoteAddress = ""
+	nid.RemotePort = 0
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with only the local port.
+	nid.LocalAddress = ""
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+}
+
+// findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in
+// descending order of match quality.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC {
+	var matchedEPs []*endpointsByNIC
+	eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+		matchedEPs = append(matchedEPs, ep)
+		return true
+	})
+	return matchedEPs
+}
+
+// findEndpointLocked returns the endpoint that most closely matches the given id.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC {
+	var matchedEP *endpointsByNIC
+	eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+		matchedEP = ep
+		return false
+	})
+	return matchedEP
+}
+
+type endpointsByNIC struct {
 	mu        sync.RWMutex
 	endpoints map[tcpip.NICID]*multiPortEndpoint
 	// seed is a random secret for a jenkins hash.
 	seed uint32
 }
 
-func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
-	epsByNic.mu.RLock()
-	defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
 	var eps []TransportEndpoint
-	for _, ep := range epsByNic.endpoints {
+	for _, ep := range epsByNIC.endpoints {
 		eps = append(eps, ep.transportEndpoints()...)
 	}
 	return eps
@@ -85,13 +152,13 @@ func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
-	epsByNic.mu.RLock()
+func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) {
+	epsByNIC.mu.RLock()
 
-	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
 	if !ok {
-		if mpep, ok = epsByNic.endpoints[0]; !ok {
-			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 			return
 		}
 	}
@@ -100,29 +167,29 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 	// endpoints bound to the right device.
 	if isMulticastOrBroadcast(id.LocalAddress) {
 		mpep.handlePacketAll(r, id, pkt)
-		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
 	// multiPortEndpoints are guaranteed to have at least one element.
-	transEP := selectEndpoint(id, mpep, epsByNic.seed)
+	transEP := selectEndpoint(id, mpep, epsByNIC.seed)
 	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
 		queuedProtocol.QueuePacket(r, transEP, id, pkt)
-		epsByNic.mu.RUnlock()
+		epsByNIC.mu.RUnlock()
 		return
 	}
 
 	transEP.HandlePacket(r, id, pkt)
-	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+	epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
-	epsByNic.mu.RLock()
-	defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
 
-	mpep, ok := epsByNic.endpoints[n.ID()]
+	mpep, ok := epsByNIC.endpoints[n.ID()]
 	if !ok {
-		mpep, ok = epsByNic.endpoints[0]
+		mpep, ok = epsByNIC.endpoints[0]
 	}
 	if !ok {
 		return
@@ -132,16 +199,16 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 	// broadcast like we are doing with handlePacket above?
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, pkt)
+	selectEndpoint(id, mpep, epsByNIC.seed).HandleControlPacket(id, typ, extra, pkt)
 }
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
-	epsByNic.mu.Lock()
-	defer epsByNic.mu.Unlock()
+func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
 
-	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
 	if !ok {
 		multiPortEp = &multiPortEndpoint{
 			demux:      d,
@@ -149,24 +216,24 @@ func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto t
 			transProto: transProto,
 			reuse:      reusePort,
 		}
-		epsByNic.endpoints[bindToDevice] = multiPortEp
+		epsByNIC.endpoints[bindToDevice] = multiPortEp
 	}
 
 	return multiPortEp.singleRegisterEndpoint(t, reusePort)
 }
 
-// unregisterEndpoint returns true if endpointsByNic has to be unregistered.
-func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
-	epsByNic.mu.Lock()
-	defer epsByNic.mu.Unlock()
-	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+// unregisterEndpoint returns true if endpointsByNIC has to be unregistered.
+func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
+	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
 	if !ok {
 		return false
 	}
 	if multiPortEp.unregisterEndpoint(t) {
-		delete(epsByNic.endpoints, bindToDevice)
+		delete(epsByNIC.endpoints, bindToDevice)
 	}
-	return len(epsByNic.endpoints) == 0
+	return len(epsByNIC.endpoints) == 0
 }
 
 // transportDemuxer demultiplexes packets targeted at a transport endpoint
@@ -184,7 +251,7 @@ type transportDemuxer struct {
 // the dispatcher to delivery packets to the QueuePacket method instead of
 // calling HandlePacket directly on the endpoint.
 type queuedTransportProtocol interface {
-	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
@@ -198,7 +265,7 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
 		for proto := range stack.transportProtocols {
 			protoIDs := protocolIDs{netProto, proto}
 			d.protocol[protoIDs] = &transportEndpoints{
-				endpoints: make(map[TransportEndpointID]*endpointsByNic),
+				endpoints: make(map[TransportEndpointID]*endpointsByNIC),
 			}
 			qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
 			if isQueued {
@@ -312,7 +379,7 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	return mpep.endpoints[idx]
 }
 
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt PacketBuffer) {
 	ep.mu.RLock()
 	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
 	// HandlePacket takes ownership of pkt, so each endpoint needs
@@ -378,16 +445,16 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
 
-	epsByNic, ok := eps.endpoints[id]
+	epsByNIC, ok := eps.endpoints[id]
 	if !ok {
-		epsByNic = &endpointsByNic{
+		epsByNIC = &endpointsByNIC{
 			endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
 			seed:      rand.Uint32(),
 		}
-		eps.endpoints[id] = epsByNic
+		eps.endpoints[id] = epsByNIC
 	}
 
-	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
+	return epsByNIC.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
@@ -403,7 +470,7 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if
 // the packet no longer needs to be handled.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -413,7 +480,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	// transport endpoints.
 	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
 		eps.mu.RLock()
-		destEPs := d.findAllEndpointsLocked(eps, id)
+		destEPs := eps.findAllEndpointsLocked(id)
 		eps.mu.RUnlock()
 		// Fail if we didn't find at least one matching transport endpoint.
 		if len(destEPs) == 0 {
@@ -439,7 +506,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	}
 
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, id)
+	ep := eps.findEndpointLocked(id)
 	eps.mu.RUnlock()
 	if ep == nil {
 		if protocol == header.UDPProtocolNumber {
@@ -453,7 +520,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 
 // deliverRawPacket attempts to deliver the given packet and returns whether it
 // was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -477,121 +544,53 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
 	}
 
-	// Try to find the endpoint.
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, id)
+	ep := eps.findEndpointLocked(id)
 	eps.mu.RUnlock()
-
-	// Fail if we didn't find one.
 	if ep == nil {
 		return false
 	}
 
-	// Deliver the packet.
 	ep.handleControlPacket(n, id, typ, extra, pkt)
-
 	return true
 }
 
-// iterEndpointsLocked yields all endpointsByNic in eps that match id, in
-// descending order of match quality. If a call to yield returns false,
-// iterEndpointsLocked stops iteration and returns immediately.
-//
-// Preconditions: eps.mu must be locked.
-func (d *transportDemuxer) iterEndpointsLocked(eps *transportEndpoints, id TransportEndpointID, yield func(*endpointsByNic) bool) {
-	// Try to find a match with the id as provided.
-	if ep, ok := eps.endpoints[id]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-
-	// Try to find a match with the id minus the local address.
-	nid := id
-
-	nid.LocalAddress = ""
-	if ep, ok := eps.endpoints[nid]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-
-	// Try to find a match with the id minus the remote part.
-	nid.LocalAddress = id.LocalAddress
-	nid.RemoteAddress = ""
-	nid.RemotePort = 0
-	if ep, ok := eps.endpoints[nid]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-
-	// Try to find a match with only the local port.
-	nid.LocalAddress = ""
-	if ep, ok := eps.endpoints[nid]; ok {
-		if !yield(ep) {
-			return
-		}
-	}
-}
-
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
-	var matchedEPs []*endpointsByNic
-	d.iterEndpointsLocked(eps, id, func(ep *endpointsByNic) bool {
-		matchedEPs = append(matchedEPs, ep)
-		return true
-	})
-	return matchedEPs
-}
-
 // findTransportEndpoint find a single endpoint that most closely matches the provided id.
 func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
 	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
 	if !ok {
 		return nil
 	}
-	// Try to find the endpoint.
+
 	eps.mu.RLock()
-	epsByNic := d.findEndpointLocked(eps, id)
-	// Fail if we didn't find one.
-	if epsByNic == nil {
+	epsByNIC := eps.findEndpointLocked(id)
+	if epsByNIC == nil {
 		eps.mu.RUnlock()
 		return nil
 	}
 
-	epsByNic.mu.RLock()
+	epsByNIC.mu.RLock()
 	eps.mu.RUnlock()
 
-	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
 	if !ok {
-		if mpep, ok = epsByNic.endpoints[0]; !ok {
-			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 			return nil
 		}
 	}
 
-	ep := selectEndpoint(id, mpep, epsByNic.seed)
-	epsByNic.mu.RUnlock()
+	ep := selectEndpoint(id, mpep, epsByNIC.seed)
+	epsByNIC.mu.RUnlock()
 	return ep
 }
 
-// findEndpointLocked returns the endpoint that most closely matches the given
-// id.
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
-	var matchedEP *endpointsByNic
-	d.iterEndpointsLocked(eps, id, func(ep *endpointsByNic) bool {
-		matchedEP = ep
-		return false
-	})
-	return matchedEP
-}
-
 // registerRawEndpoint registers the given endpoint with the dispatcher such
 // that packets of the appropriate protocol are delivered to it. A single
 // packet can be sent to one or more raw endpoints along with a non-raw
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 0e3e239c5..c65b0c632 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -31,84 +31,58 @@ import (
 )
 
 const (
-	stackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
-	testV6Addr  = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	testSrcAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	testDstAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
 
-	stackAddr = "\x0a\x00\x00\x01"
-	stackPort = 1234
-	testPort  = 4096
+	testSrcAddrV4 = "\x0a\x00\x00\x01"
+	testDstAddrV4 = "\x0a\x00\x00\x02"
+
+	testDstPort = 1234
+	testSrcPort = 4096
 )
 
 type testContext struct {
-	t       *testing.T
 	linkEps map[tcpip.NICID]*channel.Endpoint
 	s       *stack.Stack
-
-	ep tcpip.Endpoint
-	wq waiter.Queue
-}
-
-func (c *testContext) cleanup() {
-	if c.ep != nil {
-		c.ep.Close()
-	}
-}
-
-func (c *testContext) createV6Endpoint(v6only bool) {
-	var err *tcpip.Error
-	c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &c.wq)
-	if err != nil {
-		c.t.Fatalf("NewEndpoint failed: %v", err)
-	}
-
-	if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
-		c.t.Fatalf("SetSockOpt failed: %v", err)
-	}
+	wq      waiter.Queue
 }
 
 // newDualTestContextMultiNIC creates the testing context and also linkEpIDs NICs.
 func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+	})
 	linkEps := make(map[tcpip.NICID]*channel.Endpoint)
 	for _, linkEpID := range linkEpIDs {
 		channelEp := channel.New(256, mtu, "")
 		if err := s.CreateNIC(linkEpID, channelEp); err != nil {
-			t.Fatalf("CreateNIC failed: %v", err)
+			t.Fatalf("CreateNIC failed: %s", err)
 		}
 		linkEps[linkEpID] = channelEp
 
-		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, stackAddr); err != nil {
-			t.Fatalf("AddAddress IPv4 failed: %v", err)
+		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, testDstAddrV4); err != nil {
+			t.Fatalf("AddAddress IPv4 failed: %s", err)
 		}
 
-		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
-			t.Fatalf("AddAddress IPv6 failed: %v", err)
+		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, testDstAddrV6); err != nil {
+			t.Fatalf("AddAddress IPv6 failed: %s", err)
 		}
 	}
 
 	s.SetRouteTable([]tcpip.Route{
-		{
-			Destination: header.IPv4EmptySubnet,
-			NIC:         1,
-		},
-		{
-			Destination: header.IPv6EmptySubnet,
-			NIC:         1,
-		},
+		{Destination: header.IPv4EmptySubnet, NIC: 1},
+		{Destination: header.IPv6EmptySubnet, NIC: 1},
 	})
 
 	return &testContext{
-		t:       t,
 		s:       s,
 		linkEps: linkEps,
 	}
 }
 
 type headers struct {
-	srcPort uint16
-	dstPort uint16
+	srcPort, dstPort uint16
 }
 
 func newPayload() []byte {
@@ -119,6 +93,47 @@ func newPayload() []byte {
 	return b
 }
 
+func (c *testContext) sendV4Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
+	buf := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload))
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
+
+	// Initialize the IP header.
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TOS:         0x80,
+		TotalLength: uint16(len(buf)),
+		TTL:         65,
+		Protocol:    uint8(udp.ProtocolNumber),
+		SrcAddr:     testSrcAddrV4,
+		DstAddr:     testDstAddrV4,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	// Initialize the UDP header.
+	u := header.UDP(buf[header.IPv4MinimumSize:])
+	u.Encode(&header.UDPFields{
+		SrcPort: h.srcPort,
+		DstPort: h.dstPort,
+		Length:  uint16(header.UDPMinimumSize + len(payload)),
+	})
+
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV4, testDstAddrV4, uint16(len(u)))
+
+	// Calculate the UDP checksum and set it.
+	xsum = header.Checksum(payload, xsum)
+	u.SetChecksum(^u.CalculateChecksum(xsum))
+
+	// Inject packet.
+	c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
+}
+
 func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
@@ -130,8 +145,8 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
-		SrcAddr:       testV6Addr,
-		DstAddr:       stackV6Addr,
+		SrcAddr:       testSrcAddrV6,
+		DstAddr:       testDstAddrV6,
 	})
 
 	// Initialize the UDP header.
@@ -143,15 +158,17 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 	})
 
 	// Calculate the UDP pseudo-header checksum.
-	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testV6Addr, stackV6Addr, uint16(len(u)))
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV6, testDstAddrV6, uint16(len(u)))
 
 	// Calculate the UDP checksum and set it.
 	xsum = header.Checksum(payload, xsum)
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
-		Data: buf.ToVectorisedView(),
+	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
 	})
 }
 
@@ -179,15 +196,15 @@ func TestTransportDemuxerRegister(t *testing.T) {
 				t.Fatalf("%T does not implement stack.TransportEndpoint", ep)
 			}
 			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, false, 0), test.want; got != want {
-				t.Fatalf("s.RegisterTransportEndpoint(...) = %v, want %v", got, want)
+				t.Fatalf("s.RegisterTransportEndpoint(...) = %s, want %s", got, want)
 			}
 		})
 	}
 }
 
-// TestReuseBindToDevice injects varied packets on input devices and checks that
+// TestBindToDeviceDistribution injects varied packets on input devices and checks that
 // the distribution of packets received matches expectations.
-func TestDistribution(t *testing.T) {
+func TestBindToDeviceDistribution(t *testing.T) {
 	type endpointSockopts struct {
 		reuse        int
 		bindToDevice tcpip.NICID
@@ -196,19 +213,19 @@ func TestDistribution(t *testing.T) {
 		name string
 		// endpoints will received the inject packets.
 		endpoints []endpointSockopts
-		// wantedDistribution is the wanted ratio of packets received on each
+		// wantDistributions is the want ratio of packets received on each
 		// endpoint for each NIC on which packets are injected.
-		wantedDistributions map[tcpip.NICID][]float64
+		wantDistributions map[tcpip.NICID][]float64
 	}{
 		{
 			"BindPortReuse",
 			// 5 endpoints that all have reuse set.
 			[]endpointSockopts{
-				{1, 0},
-				{1, 0},
-				{1, 0},
-				{1, 0},
-				{1, 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
+				{reuse: 1, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed evenly.
@@ -219,9 +236,9 @@ func TestDistribution(t *testing.T) {
 			"BindToDevice",
 			// 3 endpoints with various bindings.
 			[]endpointSockopts{
-				{0, 1},
-				{0, 2},
-				{0, 3},
+				{reuse: 0, bindToDevice: 1},
+				{reuse: 0, bindToDevice: 2},
+				{reuse: 0, bindToDevice: 3},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 go only to the endpoint bound to dev0.
@@ -236,12 +253,12 @@ func TestDistribution(t *testing.T) {
 			"ReuseAndBindToDevice",
 			// 6 endpoints with various bindings.
 			[]endpointSockopts{
-				{1, 1},
-				{1, 1},
-				{1, 2},
-				{1, 2},
-				{1, 2},
-				{1, 0},
+				{reuse: 1, bindToDevice: 1},
+				{reuse: 1, bindToDevice: 1},
+				{reuse: 1, bindToDevice: 2},
+				{reuse: 1, bindToDevice: 2},
+				{reuse: 1, bindToDevice: 2},
+				{reuse: 1, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed among endpoints bound to
@@ -255,17 +272,17 @@ func TestDistribution(t *testing.T) {
 			},
 		},
 	} {
-		t.Run(test.name, func(t *testing.T) {
-			for device, wantedDistribution := range test.wantedDistributions {
-				t.Run(string(device), func(t *testing.T) {
+		for protoName, netProtoNum := range map[string]tcpip.NetworkProtocolNumber{
+			"IPv4": ipv4.ProtocolNumber,
+			"IPv6": ipv6.ProtocolNumber,
+		} {
+			for device, wantDistribution := range test.wantDistributions {
+				t.Run(test.name+protoName+string(device), func(t *testing.T) {
 					var devices []tcpip.NICID
-					for d := range test.wantedDistributions {
+					for d := range test.wantDistributions {
 						devices = append(devices, d)
 					}
 					c := newDualTestContextMultiNIC(t, defaultMTU, devices)
-					defer c.cleanup()
-
-					c.createV6Endpoint(false)
 
 					eps := make(map[tcpip.Endpoint]int)
 
@@ -279,9 +296,9 @@ func TestDistribution(t *testing.T) {
 						defer close(ch)
 
 						var err *tcpip.Error
-						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq)
+						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, netProtoNum, &wq)
 						if err != nil {
-							c.t.Fatalf("NewEndpoint failed: %v", err)
+							t.Fatalf("NewEndpoint failed: %s", err)
 						}
 						eps[ep] = i
 
@@ -294,20 +311,30 @@ func TestDistribution(t *testing.T) {
 						defer ep.Close()
 						reusePortOption := tcpip.ReusePortOption(endpoint.reuse)
 						if err := ep.SetSockOpt(reusePortOption); err != nil {
-							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", reusePortOption, i, err)
+							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", reusePortOption, i, err)
 						}
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
 						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
-							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", bindToDeviceOption, i, err)
+							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
+						}
+
+						var dstAddr tcpip.Address
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							dstAddr = testDstAddrV4
+						case ipv6.ProtocolNumber:
+							dstAddr = testDstAddrV6
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
 						}
-						if err := ep.Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil {
-							t.Fatalf("ep.Bind(...) on endpoint %d failed: %v", i, err)
+						if err := ep.Bind(tcpip.FullAddress{Addr: dstAddr, Port: testDstPort}); err != nil {
+							t.Fatalf("ep.Bind(...) on endpoint %d failed: %s", i, err)
 						}
 					}
 
 					npackets := 100000
 					nports := 10000
-					if got, want := len(test.endpoints), len(wantedDistribution); got != want {
+					if got, want := len(test.endpoints), len(wantDistribution); got != want {
 						t.Fatalf("got len(test.endpoints) = %d, want %d", got, want)
 					}
 					ports := make(map[uint16]tcpip.Endpoint)
@@ -316,17 +343,22 @@ func TestDistribution(t *testing.T) {
 						// Send a packet.
 						port := uint16(i % nports)
 						payload := newPayload()
-						c.sendV6Packet(payload,
-							&headers{
-								srcPort: testPort + port,
-								dstPort: stackPort},
-							device)
+						hdrs := &headers{
+							srcPort: testSrcPort + port,
+							dstPort: testDstPort,
+						}
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							c.sendV4Packet(payload, hdrs, device)
+						case ipv6.ProtocolNumber:
+							c.sendV6Packet(payload, hdrs, device)
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
+						}
 
-						var addr tcpip.FullAddress
 						ep := <-pollChannel
-						_, _, err := ep.Read(&addr)
-						if err != nil {
-							c.t.Fatalf("Read on endpoint %d failed: %v", eps[ep], err)
+						if _, _, err := ep.Read(nil); err != nil {
+							t.Fatalf("Read on endpoint %d failed: %s", eps[ep], err)
 						}
 						stats[ep]++
 						if i < nports {
@@ -342,17 +374,17 @@ func TestDistribution(t *testing.T) {
 
 					// Check that a packet distribution is as expected.
 					for ep, i := range eps {
-						wantedRatio := wantedDistribution[i]
-						wantedRecv := wantedRatio * float64(npackets)
+						wantRatio := wantDistribution[i]
+						wantRecv := wantRatio * float64(npackets)
 						actualRecv := stats[ep]
 						actualRatio := float64(stats[ep]) / float64(npackets)
 						// The deviation is less than 10%.
-						if math.Abs(actualRatio-wantedRatio) > 0.05 {
-							t.Errorf("wanted about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantedRatio*100, wantedRecv, npackets, i, actualRatio*100, actualRecv, npackets)
+						if math.Abs(actualRatio-wantRatio) > 0.05 {
+							t.Errorf("want about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantRatio*100, wantRecv, npackets, i, actualRatio*100, actualRecv, npackets)
 						}
 					}
 				})
 			}
-		})
+		}
 	}
 }
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 5d1da2f8b..8ca9ac3cf 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -87,7 +86,7 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 	if err != nil {
 		return 0, nil, err
 	}
-	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.View(v).ToVectorisedView(),
 	}); err != nil {
@@ -214,7 +213,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro
 	return tcpip.FullAddress{}, nil
 }
 
-func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ tcpip.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ stack.PacketBuffer) {
 	// Increment the number of received packets.
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
@@ -231,7 +230,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	}
 }
 
-func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, tcpip.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, stack.PacketBuffer) {
 	// Increment the number of received control packets.
 	f.proto.controlCount++
 }
@@ -242,8 +241,8 @@ func (f *fakeTransportEndpoint) State() uint32 {
 
 func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
 
-func (f *fakeTransportEndpoint) IPTables() (iptables.IPTables, error) {
-	return iptables.IPTables{}, nil
+func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) {
+	return stack.IPTables{}, nil
 }
 
 func (f *fakeTransportEndpoint) Resume(*stack.Stack) {}
@@ -288,7 +287,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool {
 	return true
 }
 
@@ -368,7 +367,7 @@ func TestTransportReceive(t *testing.T) {
 	// Make sure packet with wrong protocol is not delivered.
 	buf[0] = 1
 	buf[2] = 0
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -379,7 +378,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 3
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -390,7 +389,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 2
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 1 {
@@ -445,7 +444,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 0
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = 0
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -456,7 +455,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 3
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -467,7 +466,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 2
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 1 {
@@ -622,7 +621,7 @@ func TestTransportForwarding(t *testing.T) {
 	req[0] = 1
 	req[1] = 3
 	req[2] = byte(fakeTransNumber)
-	ep2.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep2.InjectInbound(fakeNetNumber, stack.PacketBuffer{
 		Data: req.ToVectorisedView(),
 	})
 
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index ac18ec5b1..9ce625c17 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -31,7 +31,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/tcpip/transport/tcp",
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 2a396e9bc..613b12ead 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -135,7 +134,7 @@ func (e *endpoint) Close() {
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -441,7 +440,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data.ToVectorisedView(),
 		TransportHeader: buffer.View(icmpv4),
@@ -471,7 +470,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{
 		Header:          hdr,
 		Data:            dataVV,
 		TransportHeader: buffer.View(icmpv6),
@@ -733,7 +732,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
@@ -795,7 +794,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 113d92901..3c47692b2 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,7 +104,7 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool {
 	return true
 }
 
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index d22de6b26..b989b1209 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -31,7 +31,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/waiter",
     ],
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 09a1cd436..df49d0995 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -29,7 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -100,8 +99,8 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 }
 
 // Abort implements stack.TransportEndpoint.Abort.
-func (e *endpoint) Abort() {
-	e.Close()
+func (ep *endpoint) Abort() {
+	ep.Close()
 }
 
 // Close implements tcpip.Endpoint.Close.
@@ -134,7 +133,7 @@ func (ep *endpoint) Close() {
 func (ep *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (ep *endpoint) IPTables() (iptables.IPTables, error) {
+func (ep *endpoint) IPTables() (stack.IPTables, error) {
 	return ep.stack.IPTables(), nil
 }
 
@@ -299,7 +298,7 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index c9baf4600..2eab09088 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -32,7 +32,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/packet",
         "//pkg/waiter",
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 2ef5fac76..536dafd1e 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -30,7 +30,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -161,7 +160,7 @@ func (e *endpoint) Close() {
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -342,7 +341,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
 		if !e.associated {
-			if err := route.WriteHeaderIncludedPacket(tcpip.PacketBuffer{
+			if err := route.WriteHeaderIncludedPacket(stack.PacketBuffer{
 				Data: buffer.View(payloadBytes).ToVectorisedView(),
 			}); err != nil {
 				return 0, nil, err
@@ -350,7 +349,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 			break
 		}
 		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
-		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   buffer.View(payloadBytes).ToVectorisedView(),
 		}); err != nil {
@@ -574,7 +573,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
-func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(route *stack.Route, pkt stack.PacketBuffer) {
 	e.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index a32f9eacf..7f94f9646 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -66,12 +66,10 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
-        "//pkg/tmutex",
         "//pkg/waiter",
         "@com_github_google_btree//:go_default_library",
     ],
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 85049e54e..375ca21f6 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -221,7 +221,8 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 }
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
-// the connection parameters given by the arguments.
+// the connection parameters given by the arguments. The endpoint is returned
+// with n.mu held.
 func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
@@ -243,21 +244,6 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 	n.initGSO()
 
-	// Now inherit any socket options that should be inherited from the
-	// listening endpoint.
-	// In case of Forwarder listenEP will be nil and hence this check.
-	if l.listenEP != nil {
-		l.listenEP.propagateInheritableOptions(n)
-	}
-
-	// Register new endpoint so that packets are routed to it.
-	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
-		n.Close()
-		return nil, err
-	}
-
-	n.isRegistered = true
-
 	// Create sender and receiver.
 	//
 	// The receiver at least temporarily has a zero receive window scale,
@@ -269,11 +255,27 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	// window to grow to a really large value.
 	n.rcvAutoParams.prevCopied = n.initialReceiveWindow()
 
+	// Lock the endpoint before registering to ensure that no out of
+	// band changes are possible due to incoming packets etc till
+	// the endpoint is done initializing.
+	n.mu.Lock()
+
+	// Register new endpoint so that packets are routed to it.
+	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
+		n.mu.Unlock()
+		n.Close()
+		return nil, err
+	}
+
+	n.isRegistered = true
+
 	return n, nil
 }
 
 // createEndpointAndPerformHandshake creates a new endpoint in connected state
 // and then performs the TCP 3-way handshake.
+//
+// The new endpoint is returned with e.mu held.
 func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
@@ -289,9 +291,25 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
 			l.listenEP.mu.Unlock()
+			// Ensure we release any registrations done by the newly
+			// created endpoint.
+			ep.mu.Unlock()
+			ep.Close()
+
+			// Wake up any waiters. This is strictly not required normally
+			// as a socket that was never accepted can't really have any
+			// registered waiters except when stack.Wait() is called which
+			// waits for all registered endpoints to stop and expects an
+			// EventHUp.
+			ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		l.listenEP.propagateInheritableOptionsLocked(ep)
+
 		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
@@ -299,6 +317,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// Perform the 3-way handshake.
 	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
+		ep.mu.Unlock()
 		ep.Close()
 		// Wake up any waiters. This is strictly not required normally
 		// as a socket that was never accepted can't really have any
@@ -312,9 +331,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		}
 		return nil, err
 	}
-	ep.mu.Lock()
 	ep.isConnectNotified = true
-	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
 	// handshake because it's possible that the peer doesn't support window
@@ -348,30 +365,38 @@ func (l *listenContext) closeAllPendingEndpoints() {
 }
 
 // deliverAccepted delivers the newly-accepted endpoint to the listener. If the
-// endpoint has transitioned out of the listen state, the new endpoint is closed
-// instead.
+// endpoint has transitioned out of the listen state (acceptedChan is nil),
+// the new endpoint is closed instead.
 func (e *endpoint) deliverAccepted(n *endpoint) {
 	e.mu.Lock()
-	state := e.EndpointState()
 	e.pendingAccepted.Add(1)
-	defer e.pendingAccepted.Done()
-	acceptedChan := e.acceptedChan
 	e.mu.Unlock()
+	defer e.pendingAccepted.Done()
 
-	if state == StateListen {
-		acceptedChan <- n
-		e.waiterQueue.Notify(waiter.EventIn)
-	} else {
-		n.Close()
+	e.acceptMu.Lock()
+	for {
+		if e.acceptedChan == nil {
+			e.acceptMu.Unlock()
+			n.Close()
+			return
+		}
+		select {
+		case e.acceptedChan <- n:
+			e.acceptMu.Unlock()
+			e.waiterQueue.Notify(waiter.EventIn)
+			return
+		default:
+			e.acceptCond.Wait()
+		}
 	}
 }
 
-// propagateInheritableOptions propagates any options set on the listening
+// propagateInheritableOptionsLocked propagates any options set on the listening
 // endpoint to the newly created endpoint.
-func (e *endpoint) propagateInheritableOptions(n *endpoint) {
-	e.mu.Lock()
+//
+// Precondition: e.mu and n.mu must be held.
+func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
 	n.userTimeout = e.userTimeout
-	e.mu.Unlock()
 }
 
 // handleSynSegment is called in its own goroutine once the listening endpoint
@@ -382,7 +407,11 @@ func (e *endpoint) propagateInheritableOptions(n *endpoint) {
 // cookies to accept connections.
 func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) {
 	defer decSynRcvdCount()
-	defer e.decSynRcvdCount()
+	defer func() {
+		e.mu.Lock()
+		e.decSynRcvdCount()
+		e.mu.Unlock()
+	}()
 	defer s.decRef()
 
 	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
@@ -399,30 +428,24 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 }
 
 func (e *endpoint) incSynRcvdCount() bool {
-	e.mu.Lock()
-	if e.synRcvdCount >= cap(e.acceptedChan) {
-		e.mu.Unlock()
-		return false
+	e.acceptMu.Lock()
+	canInc := e.synRcvdCount < cap(e.acceptedChan)
+	e.acceptMu.Unlock()
+	if canInc {
+		e.synRcvdCount++
 	}
-	e.synRcvdCount++
-	e.mu.Unlock()
-	return true
+	return canInc
 }
 
 func (e *endpoint) decSynRcvdCount() {
-	e.mu.Lock()
 	e.synRcvdCount--
-	e.mu.Unlock()
 }
 
 func (e *endpoint) acceptQueueIsFull() bool {
-	e.mu.Lock()
-	if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c {
-		e.mu.Unlock()
-		return true
-	}
-	e.mu.Unlock()
-	return false
+	e.acceptMu.Lock()
+	full := len(e.acceptedChan)+e.synRcvdCount >= cap(e.acceptedChan)
+	e.acceptMu.Unlock()
+	return full
 }
 
 // handleListenSegment is called when a listening endpoint receives a segment
@@ -432,7 +455,15 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
 		// must be sent in response to a SYN-ACK while in the listen
 		// state to prevent completing a handshake from an old SYN.
-		e.sendTCP(&s.route, s.id, buffer.VectorisedView{}, e.ttl, e.sendTOS, header.TCPFlagRst, s.ackNumber, 0, 0, nil, nil)
+		e.sendTCP(&s.route, tcpFields{
+			id:     s.id,
+			ttl:    e.ttl,
+			tos:    e.sendTOS,
+			flags:  header.TCPFlagRst,
+			seq:    s.ackNumber,
+			ack:    0,
+			rcvWnd: 0,
+		}, buffer.VectorisedView{}, nil)
 		return
 	}
 
@@ -480,7 +511,15 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 				TSEcr: opts.TSVal,
 				MSS:   mssForRoute(&s.route),
 			}
-			e.sendSynTCP(&s.route, s.id, e.ttl, e.sendTOS, header.TCPFlagSyn|header.TCPFlagAck, cookie, s.sequenceNumber+1, ctx.rcvWnd, synOpts)
+			e.sendSynTCP(&s.route, tcpFields{
+				id:     s.id,
+				ttl:    e.ttl,
+				tos:    e.sendTOS,
+				flags:  header.TCPFlagSyn | header.TCPFlagAck,
+				seq:    cookie,
+				ack:    s.sequenceNumber + 1,
+				rcvWnd: ctx.rcvWnd,
+			}, synOpts)
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
 		}
 
@@ -559,6 +598,10 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			return
 		}
 
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		e.propagateInheritableOptionsLocked(n)
+
 		// clear the tsOffset for the newly created
 		// endpoint as the Timestamp was already
 		// randomly offset when the original SYN-ACK was
@@ -593,14 +636,12 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 	e.mu.Lock()
 	v6only := e.v6only
-	e.mu.Unlock()
 	ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.NetProto)
 
 	defer func() {
 		// Mark endpoint as closed. This will prevent goroutines running
 		// handleSynSegment() from attempting to queue new connections
 		// to the endpoint.
-		e.mu.Lock()
 		e.setEndpointState(StateClose)
 
 		// close any endpoints in SYN-RCVD state.
@@ -622,7 +663,10 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 	s.AddWaker(&e.notificationWaker, wakerForNotification)
 	s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
 	for {
-		switch index, _ := s.Fetch(true); index {
+		e.mu.Unlock()
+		index, _ := s.Fetch(true)
+		e.mu.Lock()
+		switch index {
 		case wakerForNotification:
 			n := e.fetchNotifications()
 			if n&notifyClose != 0 {
@@ -635,7 +679,9 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 					s.decRef()
 				}
 				close(e.drainDone)
+				e.mu.Unlock()
 				<-e.undrain
+				e.mu.Lock()
 			}
 
 		case wakerForNewSegment:
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index be86af502..1d245c2c6 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -61,6 +61,9 @@ const (
 )
 
 // handshake holds the state used during a TCP 3-way handshake.
+//
+// NOTE: handshake.ep.mu is held during handshake processing. It is released if
+// we are going to block and reacquired when we start processing an event.
 type handshake struct {
 	ep     *endpoint
 	state  handshakeState
@@ -209,9 +212,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
 	h.deferAccept = deferAccept
-	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
-	h.ep.mu.Unlock()
 }
 
 // checkAck checks if the ACK number, if present, of a segment received during
@@ -241,9 +242,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
 			// was acceptable then signal the user "error: connection reset", drop
 			// the segment, enter CLOSED state, delete TCB, and return."
-			h.ep.mu.Lock()
 			h.ep.workerCleanup = true
-			h.ep.mu.Unlock()
 			// Although the RFC above calls out ECONNRESET, Linux actually returns
 			// ECONNREFUSED here so we do as well.
 			return tcpip.ErrConnectionRefused
@@ -281,9 +280,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if s.flagIsSet(header.TCPFlagAck) {
 		h.state = handshakeCompleted
 
-		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
-		h.ep.mu.Unlock()
 
 		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
 		return nil
@@ -293,11 +290,9 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// but resend our own SYN and wait for it to be acknowledged in the
 	// SYN-RCVD state.
 	h.state = handshakeSynRcvd
-	h.ep.mu.Lock()
 	ttl := h.ep.ttl
 	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
-	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
 		WS:    int(h.effectiveRcvWndScale()),
 		TS:    rcvSynOpts.TS,
@@ -313,7 +308,15 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
 	}
-	h.ep.sendSynTCP(&s.route, h.ep.ID, ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+	h.ep.sendSynTCP(&s.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
 	return nil
 }
 
@@ -357,10 +360,6 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		h.ep.mu.RLock()
-		amss := h.ep.amss
-		h.ep.mu.RUnlock()
-
 		h.resetState()
 		synOpts := header.TCPSynOptions{
 			WS:            h.rcvWndScale,
@@ -368,9 +367,17 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			TSVal:         h.ep.timestamp(),
 			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
-			MSS:           amss,
+			MSS:           h.ep.amss,
 		}
-		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+		h.ep.sendSynTCP(&s.route, tcpFields{
+			id:     h.ep.ID,
+			ttl:    h.ep.ttl,
+			tos:    h.ep.sendTOS,
+			flags:  h.flags,
+			seq:    h.iss,
+			ack:    h.ackNum,
+			rcvWnd: h.rcvWnd,
+		}, synOpts)
 		return nil
 	}
 
@@ -399,15 +406,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 		}
 		h.state = handshakeCompleted
 
-		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+
 		// If the segment has data then requeue it for the receiver
 		// to process it again once main loop is started.
 		if s.data.Size() > 0 {
 			s.incRef()
 			h.ep.enqueueSegment(s)
 		}
-		h.ep.mu.Unlock()
 		return nil
 	}
 
@@ -493,7 +499,9 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 			}
 			if n&notifyDrain != 0 {
 				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
 				<-h.ep.undrain
+				h.ep.mu.Lock()
 			}
 		}
 
@@ -535,7 +543,6 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
-	h.ep.mu.Lock()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
@@ -546,7 +553,6 @@ func (h *handshake) execute() *tcpip.Error {
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
-	h.ep.mu.Unlock()
 
 	// Execute is also called in a listen context so we want to make sure we
 	// only send the TS/SACK option when we received the TS/SACK in the
@@ -560,10 +566,23 @@ func (h *handshake) execute() *tcpip.Error {
 			synOpts.WS = -1
 		}
 	}
-	h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+
+	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    h.ep.ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
 
 	for h.state != handshakeCompleted {
-		switch index, _ := s.Fetch(true); index {
+		h.ep.mu.Unlock()
+		index, _ := s.Fetch(true)
+		h.ep.mu.Lock()
+		switch index {
+
 		case wakerForResend:
 			timeOut *= 2
 			if timeOut > MaxRTO {
@@ -579,7 +598,15 @@ func (h *handshake) execute() *tcpip.Error {
 			// the connection with another ACK or data (as ACKs are never
 			// retransmitted on their own).
 			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
-				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+				h.ep.sendSynTCP(&h.ep.route, tcpFields{
+					id:     h.ep.ID,
+					ttl:    h.ep.ttl,
+					tos:    h.ep.sendTOS,
+					flags:  h.flags,
+					seq:    h.iss,
+					ack:    h.ackNum,
+					rcvWnd: h.rcvWnd,
+				}, synOpts)
 			}
 
 		case wakerForNotification:
@@ -600,7 +627,9 @@ func (h *handshake) execute() *tcpip.Error {
 					}
 				}
 				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
 				<-h.ep.undrain
+				h.ep.mu.Lock()
 			}
 
 		case wakerForNewSegment:
@@ -690,18 +719,33 @@ func makeSynOptions(opts header.TCPSynOptions) []byte {
 	return options[:offset]
 }
 
-func (e *endpoint) sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error {
-	options := makeSynOptions(opts)
+// tcpFields is a struct to carry different parameters required by the
+// send*TCP variant functions below.
+type tcpFields struct {
+	id     stack.TransportEndpointID
+	ttl    uint8
+	tos    uint8
+	flags  byte
+	seq    seqnum.Value
+	ack    seqnum.Value
+	rcvWnd seqnum.Size
+	opts   []byte
+	txHash uint32
+}
+
+func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) *tcpip.Error {
+	tf.opts = makeSynOptions(opts)
 	// We ignore SYN send errors and let the callers re-attempt send.
-	if err := e.sendTCP(r, id, buffer.VectorisedView{}, ttl, tos, flags, seq, ack, rcvWnd, options, nil); err != nil {
+	if err := e.sendTCP(r, tf, buffer.VectorisedView{}, nil); err != nil {
 		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
 	}
-	putOptions(options)
+	putOptions(tf.opts)
 	return nil
 }
 
-func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	if err := sendTCP(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso); err != nil {
+func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	tf.txHash = e.txHash
+	if err := sendTCP(r, tf, data, gso); err != nil {
 		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
 		return err
 	}
@@ -709,8 +753,8 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
 	return nil
 }
 
-func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.PacketBuffer, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
-	optLen := len(opts)
+func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
+	optLen := len(tf.opts)
 	hdr := &pkt.Header
 	packetSize := pkt.DataSize
 	off := pkt.DataOffset
@@ -718,15 +762,15 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.Packet
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
 	pkt.TransportHeader = buffer.View(tcp)
 	tcp.Encode(&header.TCPFields{
-		SrcPort:    id.LocalPort,
-		DstPort:    id.RemotePort,
-		SeqNum:     uint32(seq),
-		AckNum:     uint32(ack),
+		SrcPort:    tf.id.LocalPort,
+		DstPort:    tf.id.RemotePort,
+		SeqNum:     uint32(tf.seq),
+		AckNum:     uint32(tf.ack),
 		DataOffset: uint8(header.TCPMinimumSize + optLen),
-		Flags:      flags,
-		WindowSize: uint16(rcvWnd),
+		Flags:      tf.flags,
+		WindowSize: uint16(tf.rcvWnd),
 	})
-	copy(tcp[header.TCPMinimumSize:], opts)
+	copy(tcp[header.TCPMinimumSize:], tf.opts)
 
 	length := uint16(hdr.UsedLength() + packetSize)
 	xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
@@ -741,13 +785,12 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.Packet
 		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, off, packetSize)
 		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
 	}
-
 }
 
-func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	optLen := len(opts)
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
+func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	optLen := len(tf.opts)
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
 	}
 
 	mss := int(gso.MSS)
@@ -756,7 +799,7 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 	// Allocate one big slice for all the headers.
 	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
 	buf := make([]byte, n*hdrSize)
-	pkts := make([]tcpip.PacketBuffer, n)
+	pkts := make([]stack.PacketBuffer, n)
 	for i := range pkts {
 		pkts[i].Header = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
 	}
@@ -772,14 +815,15 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 		pkts[i].DataOffset = off
 		pkts[i].DataSize = packetSize
 		pkts[i].Data = data
-		buildTCPHdr(r, id, &pkts[i], flags, seq, ack, rcvWnd, opts, gso)
+		pkts[i].Hash = tf.txHash
+		buildTCPHdr(r, tf, &pkts[i], gso)
 		off += packetSize
-		seq = seq.Add(seqnum.Size(packetSize))
+		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
 	}
-	if ttl == 0 {
-		ttl = r.DefaultTTL()
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
 	}
-	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
+	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos})
 	if err != nil {
 		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
 	}
@@ -789,33 +833,34 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 
 // sendTCP sends a TCP segment with the provided options via the provided
 // network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	optLen := len(opts)
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
+func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	optLen := len(tf.opts)
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
-		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
+		return sendTCPBatch(r, tf, data, gso)
 	}
 
-	pkt := tcpip.PacketBuffer{
+	pkt := stack.PacketBuffer{
 		Header:     buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
 		DataOffset: 0,
 		DataSize:   data.Size(),
 		Data:       data,
+		Hash:       tf.txHash,
 	}
-	buildTCPHdr(r, id, &pkt, flags, seq, ack, rcvWnd, opts, gso)
+	buildTCPHdr(r, tf, &pkt, gso)
 
-	if ttl == 0 {
-		ttl = r.DefaultTTL()
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, pkt); err != nil {
+	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
 		r.Stats().TCP.SegmentSendErrors.Increment()
 		return err
 	}
 	r.Stats().TCP.SegmentsSent.Increment()
-	if (flags & header.TCPFlagRst) != 0 {
+	if (tf.flags & header.TCPFlagRst) != 0 {
 		r.Stats().TCP.ResetsSent.Increment()
 	}
 	return nil
@@ -867,7 +912,16 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
-	err := e.sendTCP(&e.route, e.ID, data, e.ttl, e.sendTOS, flags, seq, ack, rcvWnd, options, e.gso)
+	err := e.sendTCP(&e.route, tcpFields{
+		id:     e.ID,
+		ttl:    e.ttl,
+		tos:    e.sendTOS,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: rcvWnd,
+		opts:   options,
+	}, data, e.gso)
 	putOptions(options)
 	return err
 }
@@ -882,7 +936,6 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 	first := e.sndQueue.Front()
 	if first != nil {
 		e.snd.writeList.PushBackList(&e.sndQueue)
-		e.snd.sndNxtList.UpdateForward(e.sndBufInQueue)
 		e.sndBufInQueue = 0
 	}
 
@@ -1016,7 +1069,6 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		// except SYN-SENT, all reset (RST) segments are
 		// validated by checking their SEQ-fields." So
 		// we only process it if it's acceptable.
-		e.mu.Lock()
 		switch e.EndpointState() {
 		// In case of a RST in CLOSE-WAIT linux moves
 		// the socket to closed state with an error set
@@ -1040,11 +1092,9 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		case StateCloseWait:
 			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
-			e.mu.Unlock()
 			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
-			e.mu.Unlock()
 			// RFC 793, page 37 states that "in all states
 			// except SYN-SENT, all reset (RST) segments are
 			// validated by checking their SEQ-fields." So
@@ -1157,9 +1207,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 		// Now check if the received segment has caused us to transition
 		// to a CLOSED state, if yes then terminate processing and do
 		// not invoke the sender.
-		e.mu.RLock()
 		state := e.state
-		e.mu.RUnlock()
 		if state == StateClose {
 			// When we get into StateClose while processing from the queue,
 			// return immediately and let the protocolMainloop handle it.
@@ -1182,9 +1230,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
-	e.mu.RLock()
 	userTimeout := e.userTimeout
-	e.mu.RUnlock()
 
 	e.keepalive.Lock()
 	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
@@ -1248,6 +1294,7 @@ func (e *endpoint) disableKeepaliveTimer() {
 // goroutine and is responsible for sending segments and handling received
 // segments.
 func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
+	e.mu.Lock()
 	var closeTimer *time.Timer
 	var closeWaker sleep.Waker
 
@@ -1269,7 +1316,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		}
 
 		e.mu.Unlock()
-		e.workMu.Unlock()
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1280,16 +1326,13 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		// completion.
 		initialRcvWnd := e.initialReceiveWindow()
 		h := newHandshake(e, seqnum.Size(initialRcvWnd))
-		e.mu.Lock()
 		h.ep.setEndpointState(StateSynSent)
-		e.mu.Unlock()
 
 		if err := h.execute(); err != nil {
 			e.lastErrorMu.Lock()
 			e.lastError = err
 			e.lastErrorMu.Unlock()
 
-			e.mu.Lock()
 			e.setEndpointState(StateError)
 			e.HardError = err
 
@@ -1302,9 +1345,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	e.keepalive.timer.init(&e.keepalive.waker)
 	defer e.keepalive.timer.cleanup()
 
-	e.mu.Lock()
 	drained := e.drainDone != nil
-	e.mu.Unlock()
 	if drained {
 		close(e.drainDone)
 		<-e.undrain
@@ -1330,10 +1371,8 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				// This means the socket is being closed due
 				// to the TCP-FIN-WAIT2 timeout was hit. Just
 				// mark the socket as closed.
-				e.mu.Lock()
 				e.transitionToStateCloseLocked()
 				e.workerCleanup = true
-				e.mu.Unlock()
 				return nil
 			},
 		},
@@ -1388,7 +1427,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				}
 
 				if n&notifyClose != 0 && closeTimer == nil {
-					e.mu.Lock()
 					if e.EndpointState() == StateFinWait2 && e.closed {
 						// The socket has been closed and we are in FIN_WAIT2
 						// so start the FIN_WAIT2 timer.
@@ -1397,7 +1435,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 						})
 						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 					}
-					e.mu.Unlock()
 				}
 
 				if n&notifyKeepaliveChanged != 0 {
@@ -1417,7 +1454,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 						// Only block the worker if the endpoint
 						// is not in closed state or error state.
 						close(e.drainDone)
+						e.mu.Unlock()
 						<-e.undrain
+						e.mu.Lock()
 					}
 				}
 
@@ -1460,7 +1499,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	}
 	e.rcvListMu.Unlock()
 
-	e.mu.Lock()
 	if e.workerCleanup {
 		e.notifyProtocolGoroutine(notifyClose)
 	}
@@ -1468,7 +1506,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
 	cleanupOnError := func(err *tcpip.Error) {
-		e.mu.Lock()
 		e.workerCleanup = true
 		if err != nil {
 			e.resetConnectionLocked(err)
@@ -1480,16 +1517,11 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 loop:
 	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
-		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
-		e.workMu.Lock()
+		e.mu.Lock()
 
-		// We need to double check here because the notification maybe
+		// We need to double check here because the notification may be
 		// stale by the time we got around to processing it.
-		//
-		// NOTE: since we now hold the workMu the processors cannot
-		// change the state of the endpoint so it's safe to proceed
-		// after this check.
 		switch e.EndpointState() {
 		case StateError:
 			// If the endpoint has already transitioned to an ERROR
@@ -1502,21 +1534,17 @@ loop:
 		case StateTimeWait:
 			fallthrough
 		case StateClose:
-			e.mu.Lock()
 			break loop
 		default:
 			if err := funcs[v].f(); err != nil {
 				cleanupOnError(err)
 				return nil
 			}
-			e.mu.Lock()
 		}
 	}
 
-	state := e.EndpointState()
-	e.mu.Unlock()
 	var reuseTW func()
-	if state == StateTimeWait {
+	if e.EndpointState() == StateTimeWait {
 		// Disable close timer as we now entering real TIME_WAIT.
 		if closeTimer != nil {
 			closeTimer.Stop()
@@ -1526,14 +1554,11 @@ loop:
 		s.Done()
 		// Wake up any waiters before we enter TIME_WAIT.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
-		e.mu.Lock()
 		e.workerCleanup = true
-		e.mu.Unlock()
 		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
-	e.mu.Lock()
 	if e.EndpointState() != StateError {
 		e.transitionToStateCloseLocked()
 	}
@@ -1649,9 +1674,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 	defer timeWaitTimer.Stop()
 
 	for {
-		e.workMu.Unlock()
+		e.mu.Unlock()
 		v, _ := s.Fetch(true)
-		e.workMu.Lock()
+		e.mu.Lock()
 		switch v {
 		case newSegment:
 			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
@@ -1674,7 +1699,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 					e.handleTimeWaitSegments()
 				}
 				close(e.drainDone)
+				e.mu.Unlock()
 				<-e.undrain
+				e.mu.Lock()
 				return nil
 			}
 		case timeWaitDone:
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index d792b07d6..6062ca916 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -18,7 +18,6 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -128,7 +127,7 @@ func (p *processor) handleSegments() {
 				continue
 			}
 
-			if !ep.workMu.TryLock() {
+			if !ep.mu.TryLock() {
 				ep.newSegmentWaker.Assert()
 				continue
 			}
@@ -138,12 +137,10 @@ func (p *processor) handleSegments() {
 			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
 				// Send any active resets if required.
 				if err != nil {
-					ep.mu.Lock()
 					ep.resetConnectionLocked(err)
-					ep.mu.Unlock()
 				}
 				ep.notifyProtocolGoroutine(notifyTickleWorker)
-				ep.workMu.Unlock()
+				ep.mu.Unlock()
 				continue
 			}
 
@@ -151,7 +148,7 @@ func (p *processor) handleSegments() {
 				p.epQ.enqueue(ep)
 			}
 
-			ep.workMu.Unlock()
+			ep.mu.Unlock()
 		}
 	}
 }
@@ -189,7 +186,7 @@ func (d *dispatcher) wait() {
 	}
 }
 
-func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	ep := stackEP.(*endpoint)
 	s := newSegment(r, id, pkt)
 	if !s.parse() {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 5187a5e25..1ebee0cfe 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -18,6 +18,7 @@ import (
 	"encoding/binary"
 	"fmt"
 	"math"
+	"runtime"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -29,11 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
-	"gvisor.dev/gvisor/pkg/tmutex"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -283,6 +282,38 @@ func (*EndpointInfo) IsEndpointInfo() {}
 // synchronized. The protocol implementation, however, runs in a single
 // goroutine.
 //
+// Each endpoint has a few mutexes:
+//
+// e.mu -> Primary mutex for an endpoint must be held for all operations except
+// in e.Readiness where acquiring it will result in a deadlock in epoll
+// implementation.
+//
+// The following three mutexes can be acquired independent of e.mu but if
+// acquired with e.mu then e.mu must be acquired first.
+//
+// e.acceptMu -> protects acceptedChan.
+// e.rcvListMu -> Protects the rcvList and associated fields.
+// e.sndBufMu -> Protects the sndQueue and associated fields.
+// e.lastErrorMu -> Protects the lastError field.
+//
+// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
+// based on the context in which the lock is acquired. In the syscall context
+// e.LockUser/e.UnlockUser should be used and when doing background processing
+// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
+// in brief.
+//
+// The reason for this locking behaviour is to avoid wakeups to handle packets.
+// In cases where the endpoint is already locked the background processor can
+// queue the packet up and go its merry way and the lock owner will eventually
+// process the backlog when releasing the lock. Similarly when acquiring the
+// lock from say a syscall goroutine we can implement a bit of spinning if we
+// know that the lock is not held by another syscall goroutine. Background
+// processors should never hold the lock for long and we can avoid an expensive
+// sleep/wakeup by spinning for a shortwhile.
+//
+// For more details please see the detailed documentation on
+// e.LockUser/e.UnlockUser methods.
+//
 // +stateify savable
 type endpoint struct {
 	EndpointInfo
@@ -299,12 +330,6 @@ type endpoint struct {
 	// Precondition: epQueue.mu must be held to read/write this field..
 	pendingProcessing bool `state:"nosave"`
 
-	// workMu is used to arbitrate which goroutine may perform protocol
-	// work. Only the main protocol goroutine is expected to call Lock() on
-	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
-	// perform work without having to wait for the main one to wake up.
-	workMu tmutex.Mutex `state:"nosave"`
-
 	// The following fields are initialized at creation time and do not
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack  `state:"manual"`
@@ -330,15 +355,11 @@ type endpoint struct {
 	rcvBufSize    int
 	rcvBufUsed    int
 	rcvAutoParams rcvBufAutoTuneParams
-	// zeroWindow indicates that the window was closed due to receive buffer
-	// space being filled up. This is set by the worker goroutine before
-	// moving a segment to the rcvList. This setting is cleared by the
-	// endpoint when a Read() call reads enough data for the new window to
-	// be non-zero.
-	zeroWindow bool
 
-	// The following fields are protected by the mutex.
-	mu sync.RWMutex `state:"nosave"`
+	// mu protects all endpoint fields unless documented otherwise. mu must
+	// be acquired before interacting with the endpoint fields.
+	mu          sync.Mutex `state:"nosave"`
+	ownedByUser uint32
 
 	// state must be read/set using the EndpointState()/setEndpointState() methods.
 	state EndpointState `state:".(EndpointState)"`
@@ -513,6 +534,23 @@ type endpoint struct {
 	// to the acceptedChan below terminate before we close acceptedChan.
 	pendingAccepted sync.WaitGroup `state:"nosave"`
 
+	// acceptMu protects acceptedChan.
+	acceptMu sync.Mutex `state:"nosave"`
+
+	// acceptCond is a condition variable that can be used to block on when
+	// acceptedChan is full and an endpoint is ready to be delivered.
+	//
+	// This condition variable is required because just blocking on sending
+	// to acceptedChan does not work in cases where endpoint.Listen is
+	// called twice with different backlog values. In such cases the channel
+	// is closed and a new one created. Any pending goroutines blocking on
+	// the write to the channel will panic.
+	//
+	// We use this condition variable to block/unblock goroutines which
+	// tried to deliver an endpoint but couldn't because accept backlog was
+	// full ( See: endpoint.deliverAccepted ).
+	acceptCond *sync.Cond `state:"nosave"`
+
 	// acceptedChan is used by a listening endpoint protocol goroutine to
 	// send newly accepted connections to the endpoint so that they can be
 	// read by Accept() calls.
@@ -561,6 +599,10 @@ type endpoint struct {
 	// endpoint and at this point the endpoint is only around
 	// to complete the TCP shutdown.
 	closed bool
+
+	// txHash is the transport layer hash to be set on outbound packets
+	// emitted by this endpoint.
+	txHash uint32
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -583,14 +625,93 @@ func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
 	return maxMSS
 }
 
+// LockUser tries to lock e.mu and if it fails it will check if the lock is held
+// by another syscall goroutine. If yes, then it will goto sleep waiting for the
+// lock to be released, if not then it will spin till it acquires the lock or
+// another syscall goroutine acquires it in which case it will goto sleep as
+// described above.
+//
+// The assumption behind spinning here being that background packet processing
+// should not be holding the lock for long and spinning reduces latency as we
+// avoid an expensive sleep/wakeup of of the syscall goroutine).
+func (e *endpoint) LockUser() {
+	for {
+		// Try first if the sock is locked then check if it's owned
+		// by another user goroutine if not then we spin, otherwise
+		// we just goto sleep on the Lock() and wait.
+		if !e.mu.TryLock() {
+			// If socket is owned by the user then just goto sleep
+			// as the lock could be held for a reasonably long time.
+			if atomic.LoadUint32(&e.ownedByUser) == 1 {
+				e.mu.Lock()
+				atomic.StoreUint32(&e.ownedByUser, 1)
+				return
+			}
+			// Spin but yield the processor since the lower half
+			// should yield the lock soon.
+			runtime.Gosched()
+			continue
+		}
+		atomic.StoreUint32(&e.ownedByUser, 1)
+		return
+	}
+}
+
+// UnlockUser will check if there are any segments already queued for processing
+// and process any such segments before unlocking e.mu. This is required because
+// we when packets arrive and endpoint lock is already held then such packets
+// are queued up to be processed. If the lock is held by the endpoint goroutine
+// then it will process these packets but if the lock is instead held by the
+// syscall goroutine then we can have the syscall goroutine process the backlog
+// before unlocking.
+//
+// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
+// endpoint. It's also required eventually when we get rid of the endpoint
+// protocol goroutine altogether.
+//
+// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
+func (e *endpoint) UnlockUser() {
+	// Lock segment queue before checking so that we avoid a race where
+	// segments can be queued between the time we check if queue is empty
+	// and actually unlock the endpoint mutex.
+	for {
+		e.segmentQueue.mu.Lock()
+		if e.segmentQueue.emptyLocked() {
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			e.segmentQueue.mu.Unlock()
+			return
+		}
+		e.segmentQueue.mu.Unlock()
+
+		switch e.EndpointState() {
+		case StateEstablished:
+			if err := e.handleSegments(true /* fastPath */); err != nil {
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+			}
+		default:
+			// Since we are waking the endpoint goroutine here just unlock
+			// and let it process the queued segments.
+			e.newSegmentWaker.Assert()
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			return
+		}
+	}
+}
+
 // StopWork halts packet processing. Only to be used in tests.
 func (e *endpoint) StopWork() {
-	e.workMu.Lock()
+	e.mu.Lock()
 }
 
 // ResumeWork resumes packet processing. Only to be used in tests.
 func (e *endpoint) ResumeWork() {
-	e.workMu.Unlock()
+	e.mu.Unlock()
 }
 
 // setEndpointState updates the state of the endpoint to state atomically. This
@@ -672,6 +793,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 			count:    9,
 		},
 		uniqueID: s.UniqueID(),
+		txHash:   s.Rand().Uint32(),
 	}
 
 	var ss SendBufferSizeOption
@@ -709,9 +831,8 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 	}
 
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	e.workMu.Init()
-	e.workMu.Lock()
 	e.tsOffset = timeStampOffset()
+	e.acceptCond = sync.NewCond(&e.acceptMu)
 
 	return e
 }
@@ -721,9 +842,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	result := waiter.EventMask(0)
 
-	e.mu.RLock()
-	defer e.mu.RUnlock()
-
 	switch e.EndpointState() {
 	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
@@ -735,9 +853,11 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	case StateListen:
 		// Check if there's anything in the accepted channel.
 		if (mask & waiter.EventIn) != 0 {
+			e.acceptMu.Lock()
 			if len(e.acceptedChan) > 0 {
 				result |= waiter.EventIn
 			}
+			e.acceptMu.Unlock()
 		}
 	}
 	if e.EndpointState().connected() {
@@ -823,20 +943,22 @@ func (e *endpoint) Abort() {
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
 func (e *endpoint) Close() {
-	e.mu.Lock()
-	closed := e.closed
-	e.closed = true
-	e.mu.Unlock()
-	if closed {
+	e.LockUser()
+	defer e.UnlockUser()
+	if e.closed {
 		return
 	}
 
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
-	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
-
-	e.mu.Lock()
+	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdownLocked()
+}
 
+// closeNoShutdown closes the endpoint without doing a full shutdown. This is
+// used when a connection needs to be aborted with a RST and we want to skip
+// a full 4 way TCP shutdown.
+func (e *endpoint) closeNoShutdownLocked() {
 	// For listening sockets, we always release ports inline so that they
 	// are immediately available for reuse after Close() is called. If also
 	// registered, we unregister as well otherwise the next user would fail
@@ -853,6 +975,8 @@ func (e *endpoint) Close() {
 		e.boundPortFlags = ports.Flags{}
 	}
 
+	// Mark endpoint as closed.
+	e.closed = true
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	switch e.EndpointState() {
@@ -873,48 +997,33 @@ func (e *endpoint) Close() {
 		// goroutine terminates.
 		e.notifyProtocolGoroutine(notifyClose)
 	}
-
-	e.mu.Unlock()
 }
 
 // closePendingAcceptableConnections closes all connections that have completed
 // handshake but not yet been delivered to the application.
 func (e *endpoint) closePendingAcceptableConnectionsLocked() {
-	done := make(chan struct{})
-	// Spin a goroutine up as ranging on e.acceptedChan will just block when
-	// there are no more connections in the channel. Using a non-blocking
-	// select does not work as it can potentially select the default case
-	// even when there are pending writes but that are not yet written to
-	// the channel.
-	go func() {
-		defer close(done)
-		for n := range e.acceptedChan {
-			n.notifyProtocolGoroutine(notifyReset)
-			// close all connections that have completed but
-			// not accepted by the application.
-			n.Close()
-		}
-	}()
-	// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
-	// endpoints which have completed handshake but are not yet written to
-	// the e.acceptedChan. We wait here till the goroutine above can drain
-	// all such connections from e.acceptedChan.
-	e.pendingAccepted.Wait()
+	e.acceptMu.Lock()
+	if e.acceptedChan == nil {
+		e.acceptMu.Unlock()
+		return
+	}
+
 	close(e.acceptedChan)
-	<-done
 	e.acceptedChan = nil
+	e.acceptCond.Broadcast()
+	e.acceptMu.Unlock()
+
+	// Wait for all pending endpoints to close.
+	e.pendingAccepted.Wait()
 }
 
 // cleanupLocked frees all resources associated with the endpoint. It is called
 // after Close() is called and the worker goroutine (if any) is done with its
 // work.
 func (e *endpoint) cleanupLocked() {
-
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
-	if e.acceptedChan != nil {
-		e.closePendingAcceptableConnectionsLocked()
-	}
+	e.closePendingAcceptableConnectionsLocked()
 
 	e.workerCleanup = false
 
@@ -954,18 +1063,18 @@ func (e *endpoint) initialReceiveWindow() int {
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
 // based on the number of bytes copied to user space.
 func (e *endpoint) ModerateRecvBuf(copied int) {
-	e.mu.RLock()
+	e.LockUser()
+	defer e.UnlockUser()
+
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
-		e.mu.RUnlock()
 		return
 	}
 	now := time.Now()
 	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
 		e.rcvAutoParams.copied += copied
 		e.rcvListMu.Unlock()
-		e.mu.RUnlock()
 		return
 	}
 	prevRTTCopied := e.rcvAutoParams.copied + copied
@@ -1021,17 +1130,16 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvAutoParams.measureTime = now
 	e.rcvAutoParams.copied = 0
 	e.rcvListMu.Unlock()
-	e.mu.RUnlock()
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
 // Read reads data from the endpoint.
 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
-	e.mu.RLock()
+	e.LockUser()
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data. Also note that a RST being received
 	// would cause the state to become StateError so we should allow the
@@ -1041,7 +1149,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
 		he := e.HardError
-		e.mu.RUnlock()
+		e.UnlockUser()
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
@@ -1051,7 +1159,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-	e.mu.RUnlock()
+	e.UnlockUser()
 
 	if err == tcpip.ErrClosedForReceive {
 		e.stats.ReadErrors.ReadClosed.Increment()
@@ -1124,13 +1232,13 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
 	// and opts.EndOfRecord are also ignored.
 
-	e.mu.RLock()
+	e.LockUser()
 	e.sndBufMu.Lock()
 
 	avail, err := e.isEndpointWritableLocked()
 	if err != nil {
 		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 		e.stats.WriteErrors.WriteClosed.Increment()
 		return 0, nil, err
 	}
@@ -1142,113 +1250,68 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	// are copying data in.
 	if !opts.Atomic {
 		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 	}
 
 	// Fetch data.
 	v, perr := p.Payload(avail)
 	if perr != nil || len(v) == 0 {
-		if opts.Atomic { // See above.
+		// Note that perr may be nil if len(v) == 0.
+		if opts.Atomic {
 			e.sndBufMu.Unlock()
-			e.mu.RUnlock()
+			e.UnlockUser()
 		}
-		// Note that perr may be nil if len(v) == 0.
 		return 0, nil, perr
 	}
 
-	if opts.Atomic {
+	queueAndSend := func() (int64, <-chan struct{}, *tcpip.Error) {
 		// Add data to the send queue.
 		s := newSegmentFromView(&e.route, e.ID, v)
 		e.sndBufUsed += len(v)
 		e.sndBufInQueue += seqnum.Size(len(v))
 		e.sndQueue.PushBack(s)
 		e.sndBufMu.Unlock()
-		// Release the endpoint lock to prevent deadlocks due to lock
-		// order inversion when acquiring workMu.
-		e.mu.RUnlock()
-	}
-
-	if e.workMu.TryLock() {
-		// Since we released locks in between it's possible that the
-		// endpoint transitioned to a CLOSED/ERROR states so make
-		// sure endpoint is still writable before trying to write.
-		if !opts.Atomic { // See above.
-			e.mu.RLock()
-			e.sndBufMu.Lock()
-
-			// Because we released the lock before copying, check state again
-			// to make sure the endpoint is still in a valid state for a write.
-			avail, err = e.isEndpointWritableLocked()
-			if err != nil {
-				e.sndBufMu.Unlock()
-				e.mu.RUnlock()
-				e.stats.WriteErrors.WriteClosed.Increment()
-				return 0, nil, err
-			}
-
-			// Discard any excess data copied in due to avail being reduced due
-			// to a simultaneous write call to the socket.
-			if avail < len(v) {
-				v = v[:avail]
-			}
-			// Add data to the send queue.
-			s := newSegmentFromView(&e.route, e.ID, v)
-			e.sndBufUsed += len(v)
-			e.sndBufInQueue += seqnum.Size(len(v))
-			e.sndQueue.PushBack(s)
-			e.sndBufMu.Unlock()
-			// Release the endpoint lock to prevent deadlocks due to lock
-			// order inversion when acquiring workMu.
-			e.mu.RUnlock()
 
-		}
 		// Do the work inline.
 		e.handleWrite()
-		e.workMu.Unlock()
-	} else {
-		if !opts.Atomic { // See above.
-			e.mu.RLock()
-			e.sndBufMu.Lock()
+		e.UnlockUser()
+		return int64(len(v)), nil, nil
+	}
 
-			// Because we released the lock before copying, check state again
-			// to make sure the endpoint is still in a valid state for a write.
-			avail, err = e.isEndpointWritableLocked()
-			if err != nil {
-				e.sndBufMu.Unlock()
-				e.mu.RUnlock()
-				e.stats.WriteErrors.WriteClosed.Increment()
-				return 0, nil, err
-			}
+	if opts.Atomic {
+		// Locks released in queueAndSend()
+		return queueAndSend()
+	}
 
-			// Discard any excess data copied in due to avail being reduced due
-			// to a simultaneous write call to the socket.
-			if avail < len(v) {
-				v = v[:avail]
-			}
-			// Add data to the send queue.
-			s := newSegmentFromView(&e.route, e.ID, v)
-			e.sndBufUsed += len(v)
-			e.sndBufInQueue += seqnum.Size(len(v))
-			e.sndQueue.PushBack(s)
-			e.sndBufMu.Unlock()
-			// Release the endpoint lock to prevent deadlocks due to lock
-			// order inversion when acquiring workMu.
-			e.mu.RUnlock()
+	// Since we released locks in between it's possible that the
+	// endpoint transitioned to a CLOSED/ERROR states so make
+	// sure endpoint is still writable before trying to write.
+	e.LockUser()
+	e.sndBufMu.Lock()
+	avail, err = e.isEndpointWritableLocked()
+	if err != nil {
+		e.sndBufMu.Unlock()
+		e.UnlockUser()
+		e.stats.WriteErrors.WriteClosed.Increment()
+		return 0, nil, err
+	}
 
-		}
-		// Let the protocol goroutine do the work.
-		e.sndWaker.Assert()
+	// Discard any excess data copied in due to avail being reduced due
+	// to a simultaneous write call to the socket.
+	if avail < len(v) {
+		v = v[:avail]
 	}
 
-	return int64(len(v)), nil, nil
+	// Locks released in queueAndSend()
+	return queueAndSend()
 }
 
 // Peek reads data without consuming it from the endpoint.
 //
 // This method does not block if there is no data pending.
 func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data.
@@ -1339,6 +1402,9 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo
 
 // SetSockOptBool sets a socket option.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+
 	switch opt {
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
@@ -1346,9 +1412,6 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		e.mu.Lock()
-		defer e.mu.Unlock()
-
 		// We only allow this to be set when we're in the initial state.
 		if e.EndpointState() != StateInitial {
 			return tcpip.ErrInvalidEndpointState
@@ -1379,7 +1442,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 		mask := uint32(notifyReceiveWindowChanged)
 
-		e.mu.RLock()
+		e.LockUser()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1409,8 +1472,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
+
 		e.rcvListMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 		e.notifyProtocolGoroutine(mask)
 		return nil
 
@@ -1466,15 +1530,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case tcpip.ReuseAddressOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.reuseAddr = v != 0
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.ReusePortOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.reusePort = v != 0
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.BindToDeviceOption:
@@ -1482,9 +1546,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
-		e.mu.Lock()
+		e.LockUser()
 		e.bindToDevice = id
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.QuickAckOption:
@@ -1500,16 +1564,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
 			return tcpip.ErrInvalidOptionValue
 		}
-		e.mu.Lock()
+		e.LockUser()
 		e.userMSS = uint16(userMSS)
-		e.mu.Unlock()
+		e.UnlockUser()
 		e.notifyProtocolGoroutine(notifyMSSChanged)
 		return nil
 
 	case tcpip.TTLOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.ttl = uint8(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.KeepaliveEnabledOption:
@@ -1541,15 +1605,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case tcpip.TCPUserTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.userTimeout = time.Duration(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.BroadcastOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.broadcast = v != 0
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.CongestionControlOption:
@@ -1563,22 +1627,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		availCC := strings.Split(string(avail), " ")
 		for _, cc := range availCC {
 			if v == tcpip.CongestionControlOption(cc) {
-				// Acquire the work mutex as we may need to
-				// reinitialize the congestion control state.
-				e.mu.Lock()
+				e.LockUser()
 				state := e.EndpointState()
 				e.cc = v
-				e.mu.Unlock()
 				switch state {
 				case StateEstablished:
-					e.workMu.Lock()
-					e.mu.Lock()
 					if e.EndpointState() == state {
 						e.snd.cc = e.snd.initCongestionControl(e.cc)
 					}
-					e.mu.Unlock()
-					e.workMu.Unlock()
 				}
+				e.UnlockUser()
 				return nil
 			}
 		}
@@ -1588,23 +1646,23 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		return tcpip.ErrNoSuchFile
 
 	case tcpip.IPv4TOSOption:
-		e.mu.Lock()
+		e.LockUser()
 		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
 		// ignore the bits for now.
 		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.IPv6TrafficClassOption:
-		e.mu.Lock()
+		e.LockUser()
 		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
 		// ignore the bits for now.
 		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.TCPLingerTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		if v < 0 {
 			// Same as effectively disabling TCPLinger timeout.
 			v = 0
@@ -1622,16 +1680,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			v = stkTCPLingerTimeout
 		}
 		e.tcpLingerTimeout = time.Duration(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case tcpip.TCPDeferAcceptOption:
-		e.mu.Lock()
+		e.LockUser()
 		if time.Duration(v) > MaxRTO {
 			v = tcpip.TCPDeferAcceptOption(MaxRTO)
 		}
 		e.deferAccept = time.Duration(v)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	default:
@@ -1641,8 +1699,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 
 // readyReceiveSize returns the number of bytes ready to be received.
 func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// The endpoint cannot be in listen state.
 	if e.EndpointState() == StateListen {
@@ -1664,9 +1722,9 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 			return false, tcpip.ErrUnknownProtocolOption
 		}
 
-		e.mu.Lock()
+		e.LockUser()
 		v := e.v6only
-		e.mu.Unlock()
+		e.UnlockUser()
 
 		return v, nil
 	}
@@ -1730,9 +1788,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.ReuseAddressOption:
-		e.mu.RLock()
+		e.LockUser()
 		v := e.reuseAddr
-		e.mu.RUnlock()
+		e.UnlockUser()
 
 		*o = 0
 		if v {
@@ -1741,9 +1799,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.ReusePortOption:
-		e.mu.RLock()
+		e.LockUser()
 		v := e.reusePort
-		e.mu.RUnlock()
+		e.UnlockUser()
 
 		*o = 0
 		if v {
@@ -1752,9 +1810,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.BindToDeviceOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
-		e.mu.RUnlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.QuickAckOption:
@@ -1765,16 +1823,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.TTLOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TTLOption(e.ttl)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.TCPInfoOption:
 		*o = tcpip.TCPInfoOption{}
-		e.mu.RLock()
+		e.LockUser()
 		snd := e.snd
-		e.mu.RUnlock()
+		e.UnlockUser()
 		if snd != nil {
 			snd.rtt.Lock()
 			o.RTT = snd.rtt.srtt
@@ -1813,9 +1871,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.TCPUserTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.OutOfBandInlineOption:
@@ -1824,9 +1882,9 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.BroadcastOption:
-		e.mu.Lock()
+		e.LockUser()
 		v := e.broadcast
-		e.mu.Unlock()
+		e.UnlockUser()
 
 		*o = 0
 		if v {
@@ -1835,33 +1893,33 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		return nil
 
 	case *tcpip.CongestionControlOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = e.cc
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.IPv4TOSOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.IPv4TOSOption(e.sendTOS)
-		e.mu.RUnlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.IPv6TrafficClassOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
-		e.mu.RUnlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.TCPLingerTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	case *tcpip.TCPDeferAcceptOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
-		e.mu.Unlock()
+		e.UnlockUser()
 		return nil
 
 	default:
@@ -1901,8 +1959,8 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 // yet accepted by the app, they are restored without running the main goroutine
 // here.
 func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	connectingAddr := addr.Addr
 
@@ -2071,9 +2129,13 @@ func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
 // Shutdown closes the read and/or write end of the endpoint connection to its
 // peer.
 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
-	e.mu.Lock()
+	e.LockUser()
+	defer e.UnlockUser()
+	return e.shutdownLocked(flags)
+}
+
+func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.shutdownFlags |= flags
-	finQueued := false
 	switch {
 	case e.EndpointState().connected():
 		// Close for read.
@@ -2087,24 +2149,9 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				e.mu.Unlock()
-				// Try to send an active reset immediately if the
-				// work mutex is available.
-				if e.workMu.TryLock() {
-					e.mu.Lock()
-					// We need to double check here to make
-					// sure worker has not transitioned the
-					// endpoint out of a connected state
-					// before trying to send a reset.
-					if e.EndpointState().connected() {
-						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-						e.notifyProtocolGoroutine(notifyTickleWorker)
-					}
-					e.mu.Unlock()
-					e.workMu.Unlock()
-				} else {
-					e.notifyProtocolGoroutine(notifyReset)
-				}
+				e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+				// Wake up worker to terminate loop.
+				e.notifyProtocolGoroutine(notifyTickleWorker)
 				return nil
 			}
 		}
@@ -2116,42 +2163,32 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 				// Already closed.
 				e.sndBufMu.Unlock()
 				if e.EndpointState() == StateTimeWait {
-					e.mu.Unlock()
 					return tcpip.ErrNotConnected
 				}
-				break
+				return nil
 			}
 
 			// Queue fin segment.
 			s := newSegmentFromView(&e.route, e.ID, nil)
 			e.sndQueue.PushBack(s)
 			e.sndBufInQueue++
-			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
 			e.sndBufMu.Unlock()
+			e.handleClose()
 		}
 
+		return nil
 	case e.EndpointState() == StateListen:
 		// Tell protocolListenLoop to stop.
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
 		}
+		return nil
+
 	default:
-		e.mu.Unlock()
 		return tcpip.ErrNotConnected
 	}
-	e.mu.Unlock()
-	if finQueued {
-		if e.workMu.TryLock() {
-			e.handleClose()
-			e.workMu.Unlock()
-		} else {
-			// Tell protocol goroutine to close.
-			e.sndCloseWaker.Assert()
-		}
-	}
-	return nil
 }
 
 // Listen puts the endpoint in "listen" mode, which allows it to accept
@@ -2166,8 +2203,8 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error {
 }
 
 func (e *endpoint) listen(backlog int) *tcpip.Error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// Allow the backlog to be adjusted if the endpoint is not shutting down.
 	// When the endpoint shuts down, it sets workerCleanup to true, and from
@@ -2176,6 +2213,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	if e.EndpointState() == StateListen && !e.workerCleanup {
 		// Adjust the size of the channel iff we can fix existing
 		// pending connections into the new one.
+		e.acceptMu.Lock()
+		defer e.acceptMu.Unlock()
 		if len(e.acceptedChan) > backlog {
 			return tcpip.ErrInvalidEndpointState
 		}
@@ -2188,6 +2227,11 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		for ep := range origChan {
 			e.acceptedChan <- ep
 		}
+
+		// Notify any blocked goroutines that they can attempt to
+		// deliver endpoints again.
+		e.acceptCond.Broadcast()
+
 		return nil
 	}
 
@@ -2217,9 +2261,12 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	// The channel may be non-nil when we're restoring the endpoint, and it
 	// may be pre-populated with some previously accepted (but not Accepted)
 	// endpoints.
+	e.acceptMu.Lock()
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
+	e.acceptMu.Unlock()
+
 	e.workerRunning = true
 	go e.protocolListenLoop( // S/R-SAFE: drained on save.
 		seqnum.Size(e.receiveBufferAvailable()))
@@ -2229,7 +2276,6 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
 func (e *endpoint) startAcceptedLoop() {
-	e.mu.Lock()
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2240,8 +2286,8 @@ func (e *endpoint) startAcceptedLoop() {
 // Accept returns a new endpoint if a peer has established a connection
 // to an endpoint previously set to listen mode.
 func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// Endpoint must be in listen state before it can accept connections.
 	if e.EndpointState() != StateListen {
@@ -2249,9 +2295,12 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	}
 
 	// Get the new accepted endpoint.
+	e.acceptMu.Lock()
+	defer e.acceptMu.Unlock()
 	var n *endpoint
 	select {
 	case n = <-e.acceptedChan:
+		e.acceptCond.Signal()
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
@@ -2260,8 +2309,8 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 // Bind binds the endpoint to a specific local port and optionally address.
 func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	return e.bindLocked(addr)
 }
@@ -2339,8 +2388,8 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 
 // GetLocalAddress returns the address to which the endpoint is bound.
 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	return tcpip.FullAddress{
 		Addr: e.ID.LocalAddress,
@@ -2351,8 +2400,8 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 
 // GetRemoteAddress returns the address to which the endpoint is connected.
 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	if !e.EndpointState().connected() {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
@@ -2365,7 +2414,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// TCP HandlePacket is not required anymore as inbound packets first
 	// land at the Dispatcher which then can either delivery using the
 	// worker go routine or directly do the invoke the tcp processing inline
@@ -2384,7 +2433,7 @@ func (e *endpoint) enqueueSegment(s *segment) bool {
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 	switch typ {
 	case stack.ControlPacketTooBig:
 		e.sndBufMu.Lock()
@@ -2419,7 +2468,6 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 // to be read, or when the connection is closed for receiving (in which case
 // s will be nil).
 func (e *endpoint) readyToRead(s *segment) {
-	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if s != nil {
 		s.incRef()
@@ -2434,7 +2482,6 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-	e.mu.RUnlock()
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
@@ -2578,9 +2625,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 	s.SegTime = time.Now()
 
 	// Copy EndpointID.
-	e.mu.Lock()
 	s.ID = stack.TCPEndpointID(e.ID)
-	e.mu.Unlock()
 
 	// Copy endpoint rcv state.
 	e.rcvListMu.Lock()
@@ -2710,10 +2755,10 @@ func (e *endpoint) State() uint32 {
 
 // Info returns a copy of the endpoint info.
 func (e *endpoint) Info() tcpip.EndpointInfo {
-	e.mu.RLock()
+	e.LockUser()
 	// Make a copy of the endpoint info.
 	ret := e.EndpointInfo
-	e.mu.RUnlock()
+	e.UnlockUser()
 	return &ret
 }
 
@@ -2728,9 +2773,9 @@ func (e *endpoint) Wait() {
 	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
 	defer e.waiterQueue.EventUnregister(&waitEntry)
 	for {
-		e.mu.Lock()
+		e.LockUser()
 		running := e.workerRunning
-		e.mu.Unlock()
+		e.UnlockUser()
 		if !running {
 			break
 		}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 4a46f0ec5..c3c692555 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -162,8 +162,8 @@ func (e *endpoint) loadState(state EndpointState) {
 		connectingLoading.Add(1)
 	}
 	// Directly update the state here rather than using e.setEndpointState
-	// as the endpoint is still being loaded and the stack reference to increment
-	// metrics is not yet initialized.
+	// as the endpoint is still being loaded and the stack reference is not
+	// yet initialized.
 	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
 }
 
@@ -173,6 +173,9 @@ func (e *endpoint) afterLoad() {
 	// Restore the endpoint to InitialState as it will be moved to
 	// its origEndpointState during Resume.
 	e.state = StateInitial
+	// Condition variables and mutexs are not S/R'ed so reinitialize
+	// acceptCond with e.acceptMu.
+	e.acceptCond = sync.NewCond(&e.acceptMu)
 	stack.StackFromEnv.RegisterRestoredEndpoint(e)
 }
 
@@ -180,7 +183,6 @@ func (e *endpoint) afterLoad() {
 func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	e.workMu.Init()
 	state := e.origEndpointState
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index c9ee5bf06..a094471b8 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -61,7 +61,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 73098d904..1377107ca 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -95,7 +95,7 @@ const (
 )
 
 type protocol struct {
-	mu                         sync.Mutex
+	mu                         sync.RWMutex
 	sackEnabled                bool
 	delayEnabled               bool
 	sendBufferSize             SendBufferSizeOption
@@ -140,7 +140,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // to a specific processing queue. Each queue is serviced by its own processor
 // goroutine which is responsible for dequeuing and doing full TCP dispatch of
 // the packet.
-func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	p.dispatcher.queuePacket(r, ep, id, pkt)
 }
 
@@ -151,7 +151,7 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
@@ -191,7 +191,15 @@ func replyWithReset(s *segment) {
 		flags |= header.TCPFlagAck
 		ack = s.sequenceNumber.Add(s.logicalLen())
 	}
-	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+	sendTCP(&s.route, tcpFields{
+		id:     s.id,
+		ttl:    s.route.DefaultTTL(),
+		tos:    stack.DefaultTOS,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: 0,
+	}, buffer.VectorisedView{}, nil /* gso */)
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
@@ -273,57 +281,57 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 func (p *protocol) Option(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case *SACKEnabled:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = SACKEnabled(p.sackEnabled)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *DelayEnabled:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = DelayEnabled(p.delayEnabled)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *SendBufferSizeOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = p.sendBufferSize
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *ReceiveBufferSizeOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = p.recvBufferSize
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.CongestionControlOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.CongestionControlOption(p.congestionControl)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.AvailableCongestionControlOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.ModerateReceiveBufferOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.TCPLingerTimeoutOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.TCPTimeWaitTimeoutOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	default:
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index d80aff1b6..caf8977b3 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -168,7 +168,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 		// We just received a FIN, our next state depends on whether we sent a
 		// FIN already or not.
-		r.ep.mu.Lock()
 		switch r.ep.EndpointState() {
 		case StateEstablished:
 			r.ep.setEndpointState(StateCloseWait)
@@ -183,7 +182,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateFinWait2:
 			r.ep.setEndpointState(StateTimeWait)
 		}
-		r.ep.mu.Unlock()
 
 		// Flush out any pending segments, except the very first one if
 		// it happens to be the one we're handling now because the
@@ -208,7 +206,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 	// Handle ACK (not FIN-ACK, which we handled above) during one of the
 	// shutdown states.
 	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
-		r.ep.mu.Lock()
 		switch r.ep.EndpointState() {
 		case StateFinWait1:
 			r.ep.setEndpointState(StateFinWait2)
@@ -222,7 +219,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateLastAck:
 			r.ep.transitionToStateCloseLocked()
 		}
-		r.ep.mu.Unlock()
 	}
 
 	return true
@@ -336,10 +332,8 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 // handleRcvdSegment handles TCP segments directed at the connection managed by
 // r as they arrive. It is called by the protocol main loop.
 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
-	r.ep.mu.RLock()
 	state := r.ep.EndpointState()
 	closed := r.ep.closed
-	r.ep.mu.RUnlock()
 
 	if state != StateEstablished {
 		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 1c10da5ca..e6fe7985d 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -18,7 +18,6 @@ import (
 	"sync/atomic"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
@@ -56,12 +55,12 @@ type segment struct {
 	options        []byte `state:".([]byte)"`
 	hasNewSACKInfo bool
 	rcvdTime       time.Time `state:".(unixTime)"`
-	// xmitTime is the last transmit time of this segment. A zero value
-	// indicates that the segment has yet to be transmitted.
-	xmitTime time.Time `state:".(unixTime)"`
+	// xmitTime is the last transmit time of this segment.
+	xmitTime  time.Time `state:".(unixTime)"`
+	xmitCount uint32
 }
 
-func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) *segment {
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
 		id:     id,
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index bd20a7ee9..48a257137 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -28,10 +28,16 @@ type segmentQueue struct {
 	used  int
 }
 
+// emptyLocked determines if the queue is empty.
+// Preconditions: q.mu must be held.
+func (q *segmentQueue) emptyLocked() bool {
+	return q.used == 0
+}
+
 // empty determines if the queue is empty.
 func (q *segmentQueue) empty() bool {
 	q.mu.Lock()
-	r := q.used == 0
+	r := q.emptyLocked()
 	q.mu.Unlock()
 
 	return r
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index b74b61e7d..6b7bac37d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -126,10 +126,6 @@ type sender struct {
 	// sndNxt is the sequence number of the next segment to be sent.
 	sndNxt seqnum.Value
 
-	// sndNxtList is the sequence number of the next segment to be added to
-	// the send list.
-	sndNxtList seqnum.Value
-
 	// rttMeasureSeqNum is the sequence number being used for the latest RTT
 	// measurement.
 	rttMeasureSeqNum seqnum.Value
@@ -229,7 +225,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 		sndWnd:           sndWnd,
 		sndUna:           iss + 1,
 		sndNxt:           iss + 1,
-		sndNxtList:       iss + 1,
 		rto:              1 * time.Second,
 		rttMeasureSeqNum: iss + 1,
 		lastSendTime:     time.Now(),
@@ -455,9 +450,7 @@ func (s *sender) retransmitTimerExpired() bool {
 	// Give up if we've waited more than a minute since the last resend or
 	// if a user time out is set and we have exceeded the user specified
 	// timeout since the first retransmission.
-	s.ep.mu.RLock()
 	uto := s.ep.userTimeout
-	s.ep.mu.RUnlock()
 
 	if s.firstRetransmittedSegXmitTime.IsZero() {
 		// We store the original xmitTime of the segment that we are
@@ -713,7 +706,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		default:
 			s.ep.setEndpointState(StateFinWait1)
 		}
-
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
@@ -1229,7 +1221,7 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 
 // sendSegment sends the specified segment.
 func (s *sender) sendSegment(seg *segment) *tcpip.Error {
-	if !seg.xmitTime.IsZero() {
+	if seg.xmitCount > 0 {
 		s.ep.stack.Stats().TCP.Retransmits.Increment()
 		s.ep.stats.SendErrors.Retransmits.Increment()
 		if s.sndCwnd < s.sndSsthresh {
@@ -1237,6 +1229,7 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		}
 	}
 	seg.xmitTime = time.Now()
+	seg.xmitCount++
 	return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 5b2b16afa..ce3df7478 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -2236,9 +2236,18 @@ func TestSegmentMerging(t *testing.T) {
 
 			c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-			// Prevent the endpoint from processing packets.
-			test.stop(c.EP)
+			// Send tcp.InitialCwnd number of segments to fill up
+			// InitialWindow but don't ACK. That should prevent
+			// anymore packets from going out.
+			for i := 0; i < tcp.InitialCwnd; i++ {
+				view := buffer.NewViewFromBytes([]byte{0})
+				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+					t.Fatalf("Write #%d failed: %s", i+1, err)
+				}
+			}
 
+			// Now send the segments that should get merged as the congestion
+			// window is full and we won't be able to send any more packets.
 			var allData []byte
 			for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
 				allData = append(allData, data...)
@@ -2248,8 +2257,29 @@ func TestSegmentMerging(t *testing.T) {
 				}
 			}
 
-			// Let the endpoint process the segments that we just sent.
-			test.resume(c.EP)
+			// Check that we get tcp.InitialCwnd packets.
+			for i := 0; i < tcp.InitialCwnd; i++ {
+				b := c.GetPacket()
+				checker.IPv4(t, b,
+					checker.PayloadLen(header.TCPMinimumSize+1),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.SeqNum(uint32(c.IRS)+uint32(i)+1),
+						checker.AckNum(790),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+			}
+
+			// Acknowledge the data.
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: c.Port,
+				Flags:   header.TCPFlagAck,
+				SeqNum:  790,
+				AckNum:  c.IRS.Add(1 + 10), // 10 for the 10 bytes of payload.
+				RcvWnd:  30000,
+			})
 
 			// Check that data is received.
 			b := c.GetPacket()
@@ -2257,7 +2287,7 @@ func TestSegmentMerging(t *testing.T) {
 				checker.PayloadLen(len(allData)+header.TCPMinimumSize),
 				checker.TCP(
 					checker.DstPort(context.TestPort),
-					checker.SeqNum(uint32(c.IRS)+1),
+					checker.SeqNum(uint32(c.IRS)+11),
 					checker.AckNum(790),
 					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 				),
@@ -2273,7 +2303,7 @@ func TestSegmentMerging(t *testing.T) {
 				DstPort: c.Port,
 				Flags:   header.TCPFlagAck,
 				SeqNum:  790,
-				AckNum:  c.IRS.Add(1 + seqnum.Size(len(allData))),
+				AckNum:  c.IRS.Add(11 + seqnum.Size(len(allData))),
 				RcvWnd:  30000,
 			})
 		})
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 8cea20fb5..d4f6bc635 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -307,7 +307,7 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 	copy(icmp[header.ICMPv4PayloadOffset:], p2)
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
@@ -363,7 +363,7 @@ func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcp
 // SendSegment sends a TCP segment that has already been built and written to a
 // buffer.VectorisedView.
 func (c *Context) SendSegment(s buffer.VectorisedView) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: s,
 	})
 }
@@ -371,7 +371,7 @@ func (c *Context) SendSegment(s buffer.VectorisedView) {
 // SendPacket builds and sends a TCP segment(with the provided payload & TCP
 // headers) in an IPv4 packet via the link layer endpoint.
 func (c *Context) SendPacket(payload []byte, h *Headers) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: c.BuildSegment(payload, h),
 	})
 }
@@ -380,7 +380,7 @@ func (c *Context) SendPacket(payload []byte, h *Headers) {
 // & TCPheaders) in an IPv4 packet via the link layer endpoint using the
 // provided source and destination IPv4 addresses.
 func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data: c.BuildSegmentWithAddrs(payload, h, src, dst),
 	})
 }
@@ -548,7 +548,7 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp
 	t.SetChecksum(^t.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index adc908e24..b5d2d0ba6 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -32,7 +32,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 0af4514e1..a3372ac58 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -234,7 +233,7 @@ func (e *endpoint) Close() {
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
 // IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
+func (e *endpoint) IPTables() (stack.IPTables, error) {
 	return e.stack.IPTables(), nil
 }
 
@@ -913,7 +912,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	if useDefaultTTL {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data,
 		TransportHeader: buffer.View(udp),
@@ -1260,7 +1259,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
 	// Get the header then trim it from the view.
 	hdr := header.UDP(pkt.Data.First())
 	if int(hdr.Length()) > pkt.Data.Size() {
@@ -1327,7 +1326,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State.
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index fc706ede2..a674ceb68 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -43,7 +43,7 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	f.handler(&ForwarderRequest{
 		stack: f.stack,
 		route: r,
@@ -61,7 +61,7 @@ type ForwarderRequest struct {
 	stack *stack.Stack
 	route *stack.Route
 	id    stack.TransportEndpointID
-	pkt   tcpip.PacketBuffer
+	pkt   stack.PacketBuffer
 }
 
 // ID returns the 4-tuple (src address, src port, dst address, dst port) that
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 8df089d22..6e31a9bac 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -66,7 +66,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
 	// Get the header then trim it from the view.
 	hdr := header.UDP(pkt.Data.First())
 	if int(hdr.Length()) > pkt.Data.Size() {
@@ -135,7 +135,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv4DstUnreachable)
 		pkt.SetCode(header.ICMPv4PortUnreachable)
 		pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   payload,
 		})
@@ -172,7 +172,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv6DstUnreachable)
 		pkt.SetCode(header.ICMPv6PortUnreachable)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
 			Header: hdr,
 			Data:   payload,
 		})
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 34b7c2360..0905726c1 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -439,7 +439,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
 		Data:            buf.ToVectorisedView(),
 		NetworkHeader:   buffer.View(ip),
 		TransportHeader: buffer.View(u),
@@ -486,7 +486,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 
 	// Inject packet.
 
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
 		Data:            buf.ToVectorisedView(),
 		NetworkHeader:   buffer.View(ip),
 		TransportHeader: buffer.View(u),
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f459d1973..06b9f888a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -291,6 +291,14 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(uint64(os.Getpid())),
 		},
 	},
+	syscall.SYS_UTIMENSAT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* null pathname */
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* flags */
+		},
+	},
 	syscall.SYS_WRITE: {},
 	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
 	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go
index b5a0ce17d..189244765 100644
--- a/runsc/cmd/chroot.go
+++ b/runsc/cmd/chroot.go
@@ -50,7 +50,7 @@ func pivotRoot(root string) error {
 	// new_root, so after umounting the old_root, we will see only
 	// the new_root in "/".
 	if err := syscall.PivotRoot(".", "."); err != nil {
-		return fmt.Errorf("error changing root filesystem: %v", err)
+		return fmt.Errorf("pivot_root failed, make sure that the root mount has a parent: %v", err)
 	}
 
 	if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 6e06f3c0f..02e5af3d3 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -335,7 +335,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		if err := pivotRoot("/proc"); err != nil {
-			Fatalf("faild to change the root file system: %v", err)
+			Fatalf("failed to change the root file system: %v", err)
 		}
 		if err := os.Chdir("/"); err != nil {
 			Fatalf("failed to change working directory")
diff --git a/scripts/packetimpact_tests.sh b/scripts/packetimpact_tests.sh
new file mode 100755
index 000000000..027d11e64
--- /dev/null
+++ b/scripts/packetimpact_tests.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+install_runsc_for_test runsc-d
+test_runsc $(bazel query "attr(tags, packetimpact, tests(//test/packetimpact/...))")
diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
index 4074d2285..594c8e752 100644
--- a/test/e2e/exec_test.go
+++ b/test/e2e/exec_test.go
@@ -240,17 +240,7 @@ func TestExecEnvHasHome(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("exec-env-home-test")
 
-	// We will check that HOME is set for root user, and also for a new
-	// non-root user we will create.
-	newUID := 1234
-	newHome := "/foo/bar"
-
-	// Create a new user with a home directory, and then sleep.
-	script := fmt.Sprintf(`
-	mkdir -p -m 777 %s && \
-	adduser foo -D -u %d -h %s && \
-	sleep 1000`, newHome, newUID, newHome)
-	if err := d.Run("alpine", "/bin/sh", "-c", script); err != nil {
+	if err := d.Run("alpine", "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 	defer d.CleanUp()
@@ -264,7 +254,15 @@ func TestExecEnvHasHome(t *testing.T) {
 		t.Errorf("wanted exec output to contain %q, got %q", want, got)
 	}
 
-	// Execute the same as uid 123 and expect newHome.
+	// Create a new user with a home directory.
+	newUID := 1234
+	newHome := "/foo/bar"
+	cmd := fmt.Sprintf("mkdir -p -m 777 %q && adduser foo -D -u %d -h %q", newHome, newUID, newHome)
+	if _, err := d.Exec("/bin/sh", "-c", cmd); err != nil {
+		t.Fatalf("docker exec failed: %v", err)
+	}
+
+	// Execute the same as the new user and expect newHome.
 	got, err = d.ExecAsUser(strconv.Itoa(newUID), "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 05647de33..41e0cfa8d 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -194,8 +194,11 @@ func (FilterInputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPDestPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
+	// Ensure we cannot connect to the container.
+	for start := time.Now(); time.Since(start) < sendloopDuration; {
+		if err := connectTCP(ip, dropPort, sendloopDuration-time.Since(start)); err == nil {
+			return fmt.Errorf("expected not to connect, but was able to connect on port %d", dropPort)
+		}
 	}
 
 	return nil
@@ -226,8 +229,11 @@ func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection should not be accepted, but was")
+	// Ensure we cannot connect to the container.
+	for start := time.Now(); time.Since(start) < sendloopDuration; {
+		if err := connectTCP(ip, acceptPort, sendloopDuration-time.Since(start)); err == nil {
+			return fmt.Errorf("expected not to connect, but was able to connect on port %d", acceptPort)
+		}
 	}
 
 	return nil
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index e8ae65c5a..134391e8d 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -144,7 +144,7 @@ func connectTCP(ip net.IP, port int, timeout time.Duration) error {
 	// The container may not be listening when we first connect, so retry
 	// upon error.
 	callback := func() error {
-		conn, err := net.DialTCP("tcp4", nil, &contAddr)
+		conn, err := net.DialTimeout("tcp", contAddr.String(), timeout)
 		if conn != nil {
 			conn.Close()
 		}
diff --git a/test/packetdrill/fin_wait2_timeout.pkt b/test/packetdrill/fin_wait2_timeout.pkt
index 613f0bec9..93ab08575 100644
--- a/test/packetdrill/fin_wait2_timeout.pkt
+++ b/test/packetdrill/fin_wait2_timeout.pkt
@@ -19,5 +19,5 @@
 +0 > F. 1:1(0) ack 1 <...>
 +0 < . 1:1(0) ack 2 win 257
 
-+1.1 < . 1:1(0) ack 2 win 257
++2 < . 1:1(0) ack 2 win 257
 +0 > R  2:2(0) win 0
diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/tests/defs.bzl
index 3baac567a..1b4213d9b 100644
--- a/test/packetimpact/tests/defs.bzl
+++ b/test/packetimpact/tests/defs.bzl
@@ -71,7 +71,7 @@ def packetimpact_linux_test(name, testbench_binary, **kwargs):
         name = name + "_linux_test",
         testbench_binary = testbench_binary,
         flags = ["--dut_platform", "linux"],
-        tags = PACKETIMPACT_TAGS,
+        tags = PACKETIMPACT_TAGS + ["packetimpact"],
         **kwargs
     )
 
@@ -89,7 +89,7 @@ def packetimpact_netstack_test(name, testbench_binary, **kwargs):
         # This is the default runtime unless
         # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
         flags = ["--dut_platform", "netstack", "--runtime=runsc-d"],
-        tags = PACKETIMPACT_TAGS,
+        tags = PACKETIMPACT_TAGS + ["packetimpact"],
         **kwargs
     )
 
diff --git a/test/perf/linux/futex_benchmark.cc b/test/perf/linux/futex_benchmark.cc
index b349d50bf..241f39896 100644
--- a/test/perf/linux/futex_benchmark.cc
+++ b/test/perf/linux/futex_benchmark.cc
@@ -33,24 +33,24 @@ namespace testing {
 namespace {
 
 inline int FutexWait(std::atomic<int32_t>* v, int32_t val) {
-  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, nullptr);
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, nullptr);
 }
 
-inline int FutexWaitRelativeTimeout(std::atomic<int32_t>* v, int32_t val,
-                                    const struct timespec* reltime) {
-  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, reltime);
+inline int FutexWaitMonotonicTimeout(std::atomic<int32_t>* v, int32_t val,
+                                     const struct timespec* timeout) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, timeout);
 }
 
-inline int FutexWaitAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
-                                    const struct timespec* abstime) {
-  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, abstime);
+inline int FutexWaitMonotonicDeadline(std::atomic<int32_t>* v, int32_t val,
+                                      const struct timespec* deadline) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE, val, deadline,
+                 nullptr, FUTEX_BITSET_MATCH_ANY);
 }
 
-inline int FutexWaitBitsetAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
-                                          int32_t bits,
-                                          const struct timespec* abstime) {
+inline int FutexWaitRealtimeDeadline(std::atomic<int32_t>* v, int32_t val,
+                                     const struct timespec* deadline) {
   return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME,
-                 val, abstime, nullptr, bits);
+                 val, deadline, nullptr, FUTEX_BITSET_MATCH_ANY);
 }
 
 inline int FutexWake(std::atomic<int32_t>* v, int32_t count) {
@@ -62,11 +62,11 @@ void BM_FutexWakeNop(benchmark::State& state) {
   std::atomic<int32_t> v(0);
 
   for (auto _ : state) {
-    EXPECT_EQ(0, FutexWake(&v, 1));
+    TEST_PCHECK(FutexWake(&v, 1) == 0);
   }
 }
 
-BENCHMARK(BM_FutexWakeNop);
+BENCHMARK(BM_FutexWakeNop)->MinTime(5);
 
 // This just uses FUTEX_WAIT on an address whose value has changed, i.e., the
 // syscall won't wait.
@@ -74,43 +74,63 @@ void BM_FutexWaitNop(benchmark::State& state) {
   std::atomic<int32_t> v(0);
 
   for (auto _ : state) {
-    EXPECT_EQ(-EAGAIN, FutexWait(&v, 1));
+    TEST_PCHECK(FutexWait(&v, 1) == -1 && errno == EAGAIN);
   }
 }
 
-BENCHMARK(BM_FutexWaitNop);
+BENCHMARK(BM_FutexWaitNop)->MinTime(5);
 
 // This uses FUTEX_WAIT with a timeout on an address whose value never
 // changes, such that it always times out. Timeout overhead can be estimated by
 // timer overruns for short timeouts.
-void BM_FutexWaitTimeout(benchmark::State& state) {
+void BM_FutexWaitMonotonicTimeout(benchmark::State& state) {
   const int timeout_ns = state.range(0);
   std::atomic<int32_t> v(0);
   auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
 
   for (auto _ : state) {
-    EXPECT_EQ(-ETIMEDOUT, FutexWaitRelativeTimeout(&v, 0, &ts));
+    TEST_PCHECK(FutexWaitMonotonicTimeout(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
   }
 }
 
-BENCHMARK(BM_FutexWaitTimeout)
+BENCHMARK(BM_FutexWaitMonotonicTimeout)
+    ->MinTime(5)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(10)
     ->Arg(100)
     ->Arg(1000)
     ->Arg(10000);
 
-// This calls FUTEX_WAIT_BITSET with CLOCK_REALTIME.
-void BM_FutexWaitBitset(benchmark::State& state) {
+// This uses FUTEX_WAIT_BITSET with a deadline that is in the past. This allows
+// estimation of the overhead of setting up a timer for a deadline (as opposed
+// to a timeout as specified for FUTEX_WAIT).
+void BM_FutexWaitMonotonicDeadline(benchmark::State& state) {
   std::atomic<int32_t> v(0);
-  int timeout_ns = state.range(0);
-  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+  struct timespec ts = {};
+
   for (auto _ : state) {
-    EXPECT_EQ(-ETIMEDOUT, FutexWaitBitsetAbsoluteTimeout(&v, 0, 1, &ts));
+    TEST_PCHECK(FutexWaitMonotonicDeadline(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
   }
 }
 
-BENCHMARK(BM_FutexWaitBitset)->Range(0, 100000);
+BENCHMARK(BM_FutexWaitMonotonicDeadline)->MinTime(5);
+
+// This is equivalent to BM_FutexWaitMonotonicDeadline, but uses CLOCK_REALTIME
+// instead of CLOCK_MONOTONIC for the deadline.
+void BM_FutexWaitRealtimeDeadline(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  struct timespec ts = {};
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWaitRealtimeDeadline(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
+  }
+}
+
+BENCHMARK(BM_FutexWaitRealtimeDeadline)->MinTime(5);
 
 int64_t GetCurrentMonotonicTimeNanos() {
   struct timespec ts;
@@ -130,11 +150,10 @@ void SpinNanos(int64_t delay_ns) {
 
 // Each iteration of FutexRoundtripDelayed involves a thread sending a futex
 // wakeup to another thread, which spins for delay_us and then sends a futex
-// wakeup back. The time per iteration is 2*  (delay_us + kBeforeWakeDelayNs +
+// wakeup back. The time per iteration is 2 * (delay_us + kBeforeWakeDelayNs +
 // futex/scheduling overhead).
 void BM_FutexRoundtripDelayed(benchmark::State& state) {
   const int delay_us = state.range(0);
-
   const int64_t delay_ns = delay_us * 1000;
   // Spin for an extra kBeforeWakeDelayNs before invoking FUTEX_WAKE to reduce
   // the probability that the wakeup comes before the wait, preventing the wait
@@ -165,83 +184,14 @@ void BM_FutexRoundtripDelayed(benchmark::State& state) {
 }
 
 BENCHMARK(BM_FutexRoundtripDelayed)
+    ->MinTime(5)
+    ->UseRealTime()
     ->Arg(0)
     ->Arg(10)
     ->Arg(20)
     ->Arg(50)
     ->Arg(100);
 
-// FutexLock is a simple, dumb futex based lock implementation.
-// It will try to acquire the lock by atomically incrementing the
-// lock word. If it did not increment the lock from 0 to 1, someone
-// else has the lock, so it will FUTEX_WAIT until it is woken in
-// the unlock path.
-class FutexLock {
- public:
-  FutexLock() : lock_word_(0) {}
-
-  void lock(struct timespec* deadline) {
-    int32_t val;
-    while ((val = lock_word_.fetch_add(1, std::memory_order_acquire) + 1) !=
-           1) {
-      // If we didn't get the lock by incrementing from 0 to 1,
-      // do a FUTEX_WAIT with the desired current value set to
-      // val. If val is no longer what the atomic increment returned,
-      // someone might have set it to 0 so we can try to acquire
-      // again.
-      int ret = FutexWaitAbsoluteTimeout(&lock_word_, val, deadline);
-      if (ret == 0 || ret == -EWOULDBLOCK || ret == -EINTR) {
-        continue;
-      } else {
-        FAIL() << "unexpected FUTEX_WAIT return: " << ret;
-      }
-    }
-  }
-
-  void unlock() {
-    // Store 0 into the lock word and wake one waiter. We intentionally
-    // ignore the return value of the FUTEX_WAKE here, since there may be
-    // no waiters to wake anyway.
-    lock_word_.store(0, std::memory_order_release);
-    (void)FutexWake(&lock_word_, 1);
-  }
-
- private:
-  std::atomic<int32_t> lock_word_;
-};
-
-FutexLock* test_lock;  // Used below.
-
-void FutexContend(benchmark::State& state, int thread_index,
-                  struct timespec* deadline) {
-  int counter = 0;
-  if (thread_index == 0) {
-    test_lock = new FutexLock();
-  }
-  for (auto _ : state) {
-    test_lock->lock(deadline);
-    counter++;
-    test_lock->unlock();
-  }
-  if (thread_index == 0) {
-    delete test_lock;
-  }
-  state.SetItemsProcessed(state.iterations());
-}
-
-void BM_FutexContend(benchmark::State& state) {
-  FutexContend(state, state.thread_index, nullptr);
-}
-
-BENCHMARK(BM_FutexContend)->ThreadRange(1, 1024)->UseRealTime();
-
-void BM_FutexDeadlineContend(benchmark::State& state) {
-  auto deadline = absl::ToTimespec(absl::Now() + absl::Minutes(10));
-  FutexContend(state, state.thread_index, &deadline);
-}
-
-BENCHMARK(BM_FutexDeadlineContend)->ThreadRange(1, 1024)->UseRealTime();
-
 }  // namespace
 
 }  // namespace testing
diff --git a/test/runner/runner.go b/test/runner/runner.go
index a78ef38e0..0d3742f71 100644
--- a/test/runner/runner.go
+++ b/test/runner/runner.go
@@ -300,6 +300,7 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 
 	// Test spec comes with pre-defined mounts that we don't want. Reset it.
 	spec.Mounts = nil
+	testTmpDir := "/tmp"
 	if *useTmpfs {
 		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
 		// features only available in gVisor's internal tmpfs may fail.
@@ -325,11 +326,19 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 			t.Fatalf("could not chmod temp dir: %v", err)
 		}
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
+		// "/tmp" is not replaced with a tmpfs mount inside the sandbox
+		// when it's not empty. This ensures that testTmpDir uses gofer
+		// in exclusive mode.
+		testTmpDir = tmpDir
+		if *fileAccess == "shared" {
+			// All external mounts except the root mount are shared.
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Type:        "bind",
+				Destination: "/tmp",
+				Source:      tmpDir,
+			})
+			testTmpDir = "/tmp"
+		}
 	}
 
 	// Set environment variables that indicate we are
@@ -349,12 +358,8 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 
 	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
 	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
+	env = filterEnv(env, []string{"TEST_TMPDIR"})
+	env = append(env, fmt.Sprintf("TEST_TMPDIR=%s", testTmpDir))
 
 	spec.Process.Env = env
 
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 05c952b99..4e23d1e78 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -92,6 +92,59 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
   EXPECT_EQ(buf, to_write);
 }
 
+// DeviceEntry is an entry in /proc/net/dev
+struct DeviceEntry {
+  std::string name;
+  uint64_t stats[16];
+};
+
+PosixErrorOr<std::vector<DeviceEntry>> GetDeviceMetricsFromProc(
+    const std::string dev) {
+  std::vector<std::string> lines = absl::StrSplit(dev, '\n');
+  std::vector<DeviceEntry> entries;
+
+  // /proc/net/dev prints 2 lines of headers followed by a line of metrics for
+  // each network interface.
+  for (unsigned i = 2; i < lines.size(); i++) {
+    // Ignore empty lines.
+    if (lines[i].empty()) {
+      continue;
+    }
+
+    std::vector<std::string> values =
+        absl::StrSplit(lines[i], ' ', absl::SkipWhitespace());
+
+    // Interface name + 16 values.
+    if (values.size() != 17) {
+      return PosixError(EINVAL, "invalid line: " + lines[i]);
+    }
+
+    DeviceEntry entry;
+    entry.name = values[0];
+    // Skip the interface name and read only the values.
+    for (unsigned j = 1; j < 17; j++) {
+      uint64_t num;
+      if (!absl::SimpleAtoi(values[j], &num)) {
+        return PosixError(EINVAL, "invalid value: " + values[j]);
+      }
+      entry.stats[j - 1] = num;
+    }
+
+    entries.push_back(entry);
+  }
+
+  return entries;
+}
+
+// TEST(ProcNetDev, Format) tests that /proc/net/dev is parsable and
+// contains at least one entry.
+TEST(ProcNetDev, Format) {
+  auto dev = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/dev"));
+  auto entries = ASSERT_NO_ERRNO_AND_VALUE(GetDeviceMetricsFromProc(dev));
+
+  EXPECT_GT(entries.size(), 0);
+}
+
 PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
                                              const std::string& type,
                                              const std::string& item) {
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 8e0fc9acc..ce88d90dd 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -70,20 +70,27 @@ void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
   MaybeSave();
 
   struct sock_filter filter[] = {
-      // A = seccomp_data.arch
-      BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
-      // if (A != AUDIT_ARCH_X86_64) goto kill
-      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
-      // A = seccomp_data.nr
-      BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
-      // if (A != sysno) goto allow
-      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1),
-      // return filtered_result
-      BPF_STMT(BPF_RET | BPF_K, filtered_result),
-      // allow: return SECCOMP_RET_ALLOW
-      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
-      // kill: return SECCOMP_RET_KILL
-      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
+    // A = seccomp_data.arch
+    BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
+#if defined(__x86_64__)
+    // if (A != AUDIT_ARCH_X86_64) goto kill
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
+#elif defined(__aarch64__)
+    // if (A != AUDIT_ARCH_AARCH64) goto kill
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4),
+#else
+#error "Unknown architecture"
+#endif
+    // A = seccomp_data.nr
+    BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
+    // if (A != sysno) goto allow
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1),
+    // return filtered_result
+    BPF_STMT(BPF_RET | BPF_K, filtered_result),
+    // allow: return SECCOMP_RET_ALLOW
+    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+    // kill: return SECCOMP_RET_KILL
+    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
   };
   struct sock_fprog prog;
   prog.len = ABSL_ARRAYSIZE(filter);
@@ -179,9 +186,12 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) {
           TEST_CHECK(info->si_errno == kTrapValue);
           TEST_CHECK(info->si_call_addr != nullptr);
           TEST_CHECK(info->si_syscall == kFilteredSyscall);
-#ifdef __x86_64__
+#if defined(__x86_64__)
           TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
           TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall);
+#elif defined(__aarch64__)
+          TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64);
+          TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall);
 #endif  // defined(__x86_64__)
           _exit(0);
         });
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 513b9cd1c..2503960f3 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -34,6 +34,13 @@
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 
+#ifndef AT_STATX_FORCE_SYNC
+#define AT_STATX_FORCE_SYNC 0x2000
+#endif
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_DONT_SYNC 0x4000
+#endif
+
 namespace gvisor {
 namespace testing {
 
@@ -700,8 +707,10 @@ TEST_F(StatTest, StatxInvalidFlags) {
   struct kernel_statx stx;
   EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
               SyscallFailsWithErrno(EINVAL));
+
+  // Sync flags are mutually exclusive.
   EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
-                    0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx),
+                    AT_STATX_FORCE_SYNC | AT_STATX_DONT_SYNC, 0, &stx),
               SyscallFailsWithErrno(EINVAL));
 }
 
diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
index 7e73325bf..92eec0449 100644
--- a/test/syscalls/linux/sticky.cc
+++ b/test/syscalls/linux/sticky.cc
@@ -42,8 +42,9 @@ TEST(StickyTest, StickyBitPermDenied) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -61,7 +62,8 @@ TEST(StickyTest, StickyBitPermDenied) {
         syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1),
         SyscallSucceeds());
 
-    EXPECT_THAT(rmdir(path.c_str()), SyscallFailsWithErrno(EPERM));
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallFailsWithErrno(EPERM));
   });
 }
 
@@ -96,8 +98,9 @@ TEST(StickyTest, StickyBitCapFOWNER) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -114,7 +117,8 @@ TEST(StickyTest, StickyBitCapFOWNER) {
         SyscallSucceeds());
 
     EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true));
-    EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds());
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallSucceeds());
   });
 }
 }  // namespace
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
index 569190a59..19ffbd85b 100644
--- a/test/syscalls/linux/sysret.cc
+++ b/test/syscalls/linux/sysret.cc
@@ -58,7 +58,8 @@ class SysretTest : public ::testing::Test {
 
     iov.iov_base = &regs_;
     iov.iov_len = sizeof(regs_);
-    ASSERT_THAT(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov), SyscallSucceeds());
+    ASSERT_THAT(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
 
     child_ = pid;
   }
@@ -75,7 +76,8 @@ class SysretTest : public ::testing::Test {
 #else
 #error "Unknown architecture"
 #endif
-    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov), SyscallSucceeds());
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
   }
 
   void SetRsp(uint64_t newrsp) {
@@ -86,7 +88,8 @@ class SysretTest : public ::testing::Test {
 #else
 #error "Unknown architecture"
 #endif
-    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov), SyscallSucceeds());
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
   }
 
   // Wait waits for the child pid and returns the exit status.
diff --git a/tools/BUILD b/tools/BUILD
index e73a9c885..ba3506c04 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -1,3 +1,3 @@
 package(licenses = ["notice"])
 
-exports_files(["nogo.js"])
+exports_files(["nogo.json"])
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 64171ad8d..0a74370a6 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -65,10 +65,17 @@ def cc_binary(name, static = False, **kwargs):
         **kwargs: the rest of the args.
     """
     if static:
-        if "linkopts" in kwargs:
-            kwargs["linkopts"] += ["-static", "-lstdc++"]
-        else:
-            kwargs["linkopts"] = ["-static", "-lstdc++"]
+        # How to statically link a c++ program that uses threads, like for gRPC:
+        # https://gcc.gnu.org/legacy-ml/gcc-help/2010-05/msg00029.html
+        if "linkopts" not in kwargs:
+            kwargs["linkopts"] = []
+        kwargs["linkopts"] += [
+            "-static",
+            "-lstdc++",
+            "-Wl,--whole-archive",
+            "-lpthread",
+            "-Wl,--no-whole-archive",
+        ]
     _cc_binary(
         name = name,
         **kwargs
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 729489de5..82983804c 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -413,7 +413,7 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 	// empty example instead.
 	if len(ts) == 0 {
 		b.reset()
-		b.emit("func ExampleEmptyTestSuite() {\n")
+		b.emit("func Example() {\n")
 		b.inIndent(func() {
 			b.emit("// This example is intentionally empty to ensure this file contains at least\n")
 			b.emit("// one testable entity. go-marshal is forced to emit a test file if a package\n")
diff --git a/tools/images/BUILD b/tools/images/BUILD
index fe11f08a3..66ffd02aa 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -9,7 +9,7 @@ package(
 genrule(
     name = "zone",
     outs = ["zone.txt"],
-    cmd = "gcloud config get-value compute/zone > $@",
+    cmd = "gcloud config get-value compute/zone > \"$@\"",
     tags = [
         "local",
         "manual",
diff --git a/tools/nogo.js b/tools/nogo.js
deleted file mode 100644
index fc0a4d1f0..000000000
--- a/tools/nogo.js
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "checkunsafe": {
-    "exclude_files": {
-      "/external/": "not subject to constraint"
-    }
-  }
-}
diff --git a/tools/nogo.json b/tools/nogo.json
new file mode 100644
index 000000000..ff369be6f
--- /dev/null
+++ b/tools/nogo.json
@@ -0,0 +1,95 @@
+{
+  "assign": {
+    "exclude_files": {
+      "/external/bazel_gazelle/walk/walk.go": "allowed: false positive"
+    }
+  },
+  "checkunsafe": {
+    "exclude_files": {
+      "/external/": "allowed: not subject to unsafe naming rules"
+    }
+  },
+  "copylocks": {
+    "exclude_files": {
+      ".*_state_autogen.go": "fix: m.Failf copies by value",
+      "/pkg/log/json.go": "fix: Emit passes lock by value: gvisor.dev/gvisor/pkg/log.JSONEmitter contains gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex",
+      "/pkg/log/log_test.go": "fix: call of fmt.Printf copies lock value: gvisor.dev/gvisor/pkg/log.Writer contains gvisor.dev/gvisor/pkg/sync.Mutex",
+      "/pkg/sentry/fs/host/socket_test.go": "fix: call of t.Errorf copies lock value: gvisor.dev/gvisor/pkg/sentry/fs/host.ConnectedEndpoint contains gvisor.dev/gvisor/pkg/refs.AtomicRefCount contains gvisor.dev/gvisor/pkg/sync.Mutex",
+      "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpMemInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/fs/proc/sys_net.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/proc.tcpSack contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/fs/tty/slave.go": "fix: Truncate passes lock by value: gvisor.dev/gvisor/pkg/sentry/fs/tty.slaveInodeOperations contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.SimpleFileInode contains gvisor.dev/gvisor/pkg/sentry/fs/fsutil.InodeSimpleAttributes contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/kernel/time/time.go": "fix: Readiness passes lock by value: gvisor.dev/gvisor/pkg/sentry/kernel/time.ClockEventsQueue contains gvisor.dev/gvisor/pkg/waiter.Queue contains gvisor.dev/gvisor/pkg/sync.RWMutex",
+      "/pkg/sentry/kernel/syscalls_state.go": "fix: assignment copies lock value to *s: gvisor.dev/gvisor/pkg/sentry/kernel.SyscallTable contains gvisor.dev/gvisor/pkg/sentry/kernel.SyscallFlagsTable contains gvisor.dev/gvisor/pkg/sync.Mutex"
+    }
+  },
+  "lostcancel": {
+    "exclude_files": {
+      "/pkg/tcpip/network/arp/arp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
+      "/pkg/tcpip/stack/ndp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
+      "/pkg/tcpip/transport/udp/udp_test.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak",
+      "/pkg/tcpip/transport/tcp/testing/context/context.go": "fix: the cancel function returned by context.WithTimeout should be called, not discarded, to avoid a context leak"
+    }
+  },
+  "nilness": {
+    "exclude_files": {
+      "/com_github_vishvananda_netlink/route_linux.go": "allowed: false positive",
+      "/external/bazel_gazelle/cmd/gazelle/.*": "allowed: false positive",
+      "/org_golang_x_tools/go/packages/golist.go": "allowed: runtime internals",
+      "/pkg/sentry/platform/kvm/kvm_test.go": "allowed: intentional"
+    }
+  },
+  "printf": {
+    "exclude_files": {
+      ".*_abi_autogen_test.go": "fix: Sprintf format has insufficient args",
+      "/pkg/segment/test/segment_test.go": "fix: Errorf format %d arg seg.Start is a func value, not called",
+      "/pkg/tcpip/tcpip_test.go": "fix: Error call has possible formatting directive %q",
+      "/pkg/tcpip/header/eth_test.go": "fix: Fatalf format %s reads arg #3, but call has 2 args",
+      "/pkg/tcpip/header/ndp_test.go": "fix: Errorf format %d reads arg #1, but call has 0 args",
+      "/pkg/eventchannel/event_test.go": "fix: Fatal call has possible formatting directive %v",
+      "/pkg/tcpip/stack/ndp.go": "fix: Fatalf format %s has arg protocolAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %s has arg flags of wrong type gvisor.dev/gvisor/pkg/sentry/fs.FileFlags",
+      "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %d arg f.FD is a func value, not called",
+      "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call",
+      "/pkg/tcpip/transport/udp/udp_test.go": "fix: Fatalf format %s has arg h.srcAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.FullAddress",
+      "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Fatalf format %s has arg tcpTW of wrong type gvisor.dev/gvisor/pkg/tcpip.TCPTimeWaitTimeoutOption",
+      "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Errorf call needs 1 arg but has 2 args",
+      "/pkg/tcpip/stack/ndp_test.go": "fix: Errorf format %s reads arg #3, but call has 2 args",
+      "/pkg/tcpip/stack/ndp_test.go": "fix: Fatalf format %s reads arg #5, but call has 4 args",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg protoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic1ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic2ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatal call has possible formatting directive %t",
+      "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf call has arguments but no formatting directives",
+      "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call",
+      "/pkg/sentry/fsimpl/tmpfs/stat_test.go": "fix: Errorf format %v reads arg #1, but call has 0 args",
+      "/runsc/container/test_app/test_app.go": "fix: Fatal call has possible formatting directive %q",
+      "/test/root/cgroup_test.go": "fix: Errorf format %s has arg gots of wrong type []int",
+      "/test/root/cgroup_test.go": "fix: Fatalf format %v reads arg #3, but call has 2 args",
+      "/test/runtimes/runner.go": "fix: Skip call has possible formatting directive %q",
+      "/test/runtimes/blacklist_test.go": "fix: Errorf format %q has arg blacklistFile of wrong type *string"
+    }
+  },
+  "structtag": {
+    "exclude_files": {
+      "/external/": "allowed: may use arbitrary tags"
+    }
+  },
+  "unsafeptr": {
+    "exclude_files": {
+      ".*_test.go": "allowed: exclude tests",
+      "/pkg/flipcall/flipcall_unsafe.go": "allowed: special case",
+      "/pkg/gohacks/gohacks_unsafe.go": "allowed: special case",
+      "/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/kvm/(bluepill|machine)_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go": "allowed: special case",
+      "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case",
+      "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case"
+    }
+  },
+  "unusedresult": {
+    "exclude_files": {
+      "/pkg/sentry/fsimpl/proc/task_net.go": "fix: result of fmt.Sprintf call not used",
+      "/pkg/sentry/fsimpl/proc/tasks_net.go": "fix: result of fmt.Sprintf call not used"
+    }
+  }
+}